% VQA (Visual Question Answering) dataset, cited via its ICCV 2015 paper.
% The venue is given in booktitle, so the entry is typed as a conference paper.
% keywords and terms are non-standard fields: standard styles ignore them,
% they are kept as catalogue metadata only.
@inproceedings{antol2015vqa,
  author    = {Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. Lawrence and Parikh, Devi},
  title     = {{VQA}: Visual Question Answering Dataset},
  booktitle = {International Conference on Computer Vision (ICCV)},
  year      = {2015},
  url       = {http://visualqa.org/},
  abstract  = {254,721 images, 764,163 questions, 9,934,119 answers!
===What is VQA?
VQA is a new dataset containing open-ended questions about images. These questions require an understanding of vision, language and commonsense knowledge to answer.
Over 250K images (MSCOCO and abstract scenes)
3 questions per image
10 ground truth answers per question
3 plausible (but likely incorrect) answers per question
Open-ended and multiple-choice answering tasks
Automatic evaluation metric
===Overview
For every image, we collected 3 free-form natural-language questions with 10 concise open-ended answers each. We provide two formats of the VQA task: open-ended and multiple-choice. For additional details, please see the VQA paper.
The annotations we release are the result of the following post-processing steps on the raw crowdsourced data:
Spelling correction (using Bing Speller) of question and answer strings
Question normalization (first char uppercase, last char "?")
Answer normalization (all chars lowercase, no period except as decimal point, number words -> digits, strip articles (a, an the))
Adding apostrophe if a contraction is missing it (e.g., convert "dont" to "don't")
Please follow the instructions in the README to download and setup the VQA data (annotations and images).
==October 2015: Full release (v1.0)
Real Images
204,721 MSCOCO images
(all of current train/val/test)
614,163 questions
6,141,630 ground truth answers
1,842,489 plausible answers
Abstract Scenes
50,000 abstract scenes
150,000 questions
1,500,000 ground truth answers
450,000 plausible answers
250,000 captions
==July 2015: Beta v0.9 release
123,287 MSCOCO images (all of train/val)
369,861 questions
3,698,610 ground truth answers
1,109,583 plausible answers
==June 2015: Beta v0.1 release
10,000 MSCOCO images (from train)
30,000 questions
300,000 ground truth answers
90,000 plausible answers
},
  keywords  = {deeplearning},
  terms     = {View the terms here: http://visualqa.org/terms.html},
}