Evrard t'Serstevens commited on
Commit
63c1afb
1 Parent(s): e8fb54a

add section, title, yaml tags and examples

Browse files
Files changed (7) hide show
  1. .eslintcache +1 -1
  2. package-lock.json +5 -0
  3. package.json +2 -0
  4. src/App.js +155 -7
  5. src/InputField.js +4 -2
  6. src/Instructions.js +138 -4
  7. src/Section.js +26 -0
.eslintcache CHANGED
@@ -1 +1 @@
1
- [{"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/index.js":"1","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/reportWebVitals.js":"2","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/App.js":"3","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/Instructions.js":"4","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/InputField.js":"5"},{"size":500,"mtime":1607266233636,"results":"6","hashOfConfig":"7"},{"size":362,"mtime":1607349763851,"results":"8","hashOfConfig":"7"},{"size":9989,"mtime":1607357430038,"results":"9","hashOfConfig":"7"},{"size":12558,"mtime":1607338863269,"results":"10","hashOfConfig":"7"},{"size":612,"mtime":1607350022304,"results":"11","hashOfConfig":"7"},{"filePath":"12","messages":"13","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0,"usedDeprecatedRules":"14"},"d40mw9",{"filePath":"15","messages":"16","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0,"usedDeprecatedRules":"14"},{"filePath":"17","messages":"18","errorCount":0,"warningCount":2,"fixableErrorCount":0,"fixableWarningCount":0,"source":null},{"filePath":"19","messages":"20","errorCount":0,"warningCount":1,"fixableErrorCount":0,"fixableWarningCount":0,"source":"21","usedDeprecatedRules":"14"},{"filePath":"22","messages":"23","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0,"usedDeprecatedRules":"14"},"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/index.js",[],["24","25"],"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/reportWebVitals.js",[],"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/App.js",["26","27"],"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/Instructions.js",["28"],"const NAME = 'Instructions'\n\nexport default {\n name: NAME,\n instructions: {\n homepage: {\n paragraph: [\n \"Add homepage URL here if available (unless it's a GitHub repository)\"\n 
],\n },\n repository: {\n paragraph: [\n \"If the dataset is hosted on github or has a github homepage, add URL here\"\n ],\n }, \n paper: {\n paragraph: [\n \"If the dataset was introduced by a paper or there was a paper written describing the dataset, add URL here (landing page for Arxiv paper preferred)\"\n ],\n }, \n leaderboard: {\n paragraph: [\n \"If the dataset supports an active leaderboard, add link here\"\n ],\n }, \n contact: {\n paragraph: [\n \"If known, name and email of at least one person the reader can contact for questions about the dataset.\"\n ],\n }, \n datasetSummary: {\n paragraph: [\n \"Briefly summarize the dataset, its intended use and the supported tasks. Give an overview of how and why the dataset was created. The summary should explicitly mention the languages present in the dataset (possibly in broad terms, e.g. translations between several pairs of European languages), and describe the domain, topic, or genre covered.\"\n ],\n },\n supportedTasks: {\n paragraph: [\n \"For each of the tasks tagged for this dataset, give a brief description of the tag, metrics, and suggested models (with a link to their HuggingFace implementation if available). Give a similar description of tasks that were not covered by the structured tag set (repace the `task-category-tag` with an appropriate `other:other-task-name`).\",\n \"- `task-category-tag`: The dataset can be used to train a model for [TASK NAME], which consists in [TASK DESCRIPTION]. Success on this task is typically measured by achieving a *high/low* [metric name](https://huggingface.co/metrics/metric_name). The ([model name](https://huggingface.co/model_name) or [model class](https://huggingface.co/transformers/model_doc/model_class.html)) model currently achieves the following score. 
*[IF A LEADERBOARD IS AVAILABLE]:* This task has an active leaderboard which can be found at [leaderboard url]() and ranks models based on [metric name](https://huggingface.co/metrics/metric_name) while also reporting [other metric name](https://huggingface.co/metrics/other_metric_name).\"\n ]\n },\n languages: {\n paragraph: [\n \"Provide a brief overview of the languages represented in the dataset. Describe relevant details about specifics of the language such as whether it is social media text, African American English,...\",\n \"When relevant, please provide [BCP-47 codes](https://tools.ietf.org/html/bcp47), which consist of a [primary language subtag](https://tools.ietf.org/html/bcp47#section-2.2.1), with a [script subtag](https://tools.ietf.org/html/bcp47#section-2.2.3) and/or [region subtag](https://tools.ietf.org/html/bcp47#section-2.2.4) if available.\"\n ]\n },\n dataInstances: {\n paragraph: [\n \"Provide an JSON-formatted example and brief description of a typical instance in the dataset. If available, provide a link to further examples.\",\n `\n {\n 'example_field': ...,\n ...\n }\n `,\n \"Provide any additional information that is not covered in the other sections about the data here. In particular describe any relationships between data points and if these relationships are made explicit.\",\n ]\n },\n dataFields: {\n paragraph: [\n \"List and describe the fields present in the dataset. Mention their data type, and whether they are used as input or output in any of the tasks the dataset currently supports. If the data has span indices, describe their attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc. 
If the datasets contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.\",\n \"- `example_field`: description of `example_field`\"\n ]\n },\n dataSplits: {\n paragraph: [\n \"Describe and name the splits in the dataset if there are more than one.\",\n \"Describe any criteria for splitting the data, if used. If their are differences between the splits (e.g. if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.\",\n \"Provide the sizes of each split. As appropriate, provide any descriptive statistics for the features, such as average length. For example:\",\n `| | Tain | Valid | Test |\n | ----- | ------ | ----- | ---- |\n | Input Sentences | | | |\n | Average Sentence Length | | | |`\n ]\n },\n curationRationale: {\n paragraph: [\n \"What need motivated the creation of this dataset? What are some of the reasons underlying the major choices involved in putting it together?\",\n ]\n },\n dataCollection: {\n paragraph: [\n \"Describe the data collection process. Describe any criteria for data selection or filtering. List any key words or search terms used. If possible, include runtime information for the collection process.\",\n \"If data was collected from other pre-existing datasets, link to source here and to their [Hugging Face version](https://huggingface.co/datasets/dataset_name).\",\n \"If the data was modified or normalized after being collected (e.g. if the data is word-tokenized), describe the process and the tools used.\"\n ]\n },\n sourceLanguage: {\n paragraph: [\n \"State whether the data was produced by humans or machine generated. 
Describe the people or systems who originally created the data.\",\n \"If available, include self-reported demographic or identity information for the source data creators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as a variables, particularly gender.\",\n \"Describe the conditions under which the data was created (for example, if the producers were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.\",\n \"Describe other people represented or mentioned in the data. Where possible, link to references for the information.\"\n ]\n },\n annotations: {\n paragraph: [\n \"If the dataset contains annotations which are not part of the initial data collection, describe them in the following paragraphs.\"\n ]\n },\n annotationProcess: {\n paragraph: [\n \"If applicable, describe the annotation process and any tools used, or state otherwise. Describe the amount of data annotated, if not all. Describe or reference annotation guidelines provided to the annotators. If available, provide interannotator statistics. Describe any annotation validation processes.\"\n ]\n },\n annotators: {\n paragraph: [\n \"If annotations were collected for the source data (such as class labels or syntactic parses), state whether the annotations were produced by humans or machine generated.\",\n \"Describe the people or systems who originally created the annotations and their selection criteria if applicable.\",\n \"If available, include self-reported demographic or identity information for the annotators, but avoid inferring this information. Instead state that this information is unknown. 
See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as a variables, particularly gender.\",\n \"Describe the conditions under which the data was annotated (for example, if the annotators were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.\"\n ]\n },\n personalInformation: {\n paragraph: [\n \"State whether the dataset uses identity categories and, if so, how the information is used. Describe where this information comes from (i.e. self-reporting, collecting from profiles, inferring, etc.). See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as a variables, particularly gender. State whether the data is linked to individuals and whether those individuals can be identified in the dataset, either directly or indirectly (i.e., in combination with other data).\",\n \"State whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history).\",\n \"If efforts were made to anonymize the data, describe the anonymization process.\"\n ]\n },\n socialImpact: {\n paragraph: [\n \"Please discuss some of the ways you believe the use of this dataset will impact society.\",\n \"The statement should include both positive outlooks, such as outlining how technologies developed through its use may improve people's lives, and discuss the accompanying risks. 
These risks may range from making important decisions more opaque to people who are affected by the technology, to reinforcing existing harmful biases (whose specifics should be discussed in the next section), among other considerations.\",\n \"Please also mention in this section if the proposed dataset contains a *low-resource* or under-represented language.\"\n ]\n },\n biasesDiscussion: {\n paragraph: [\n \"Provide descriptions of specific biases that are likely to be reflected in the data, and state whether any steps were taken to reduce their impact.\",\n \"For Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://arxiv.org/abs/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic.\",\n \"If analyses have been run quantifying these biases, please add brief summaries and links to the studies here.\"\n ]\n },\n limitations: {\n paragraph: [\n \"If studies of the datasets have outlined other limitations of the dataset, such as annotation artifacts, please outline and cite them here.\"\n ]\n },\n datasetCurators: {\n paragraph: [\n \"List the people involved in collecting the dataset and their affiliation(s). If funding information is known, include it here.\"\n ]\n },\n licensingInformation: {\n paragraph: [\n \"Provide the license and link to the license webpage if available.\"\n ]\n },\n citationInformation: {\n paragraph: [\n \"Provide the [BibTex](http://www.bibtex.org/)-formatted reference for the dataset. 
For example:\",\n `\n @article{article_id,\n author = {Author List},\n title = {Dataset Paper Title},\n journal = {Publication Venue},\n year = {2525}\n }\n `,\n \"If the dataset has a [DOI](https://www.doi.org/), please provide it here.\"\n ]\n },\n }\n}","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/InputField.js",[],{"ruleId":"29","replacedBy":"30"},{"ruleId":"31","replacedBy":"32"},{"ruleId":"33","severity":1,"message":"34","line":207,"column":71,"nodeType":"35","endLine":207,"endColumn":86},{"ruleId":"33","severity":1,"message":"34","line":217,"column":99,"nodeType":"35","endLine":217,"endColumn":114},{"ruleId":"36","severity":1,"message":"37","line":3,"column":1,"nodeType":"38","endLine":166,"endColumn":2},"no-native-reassign",["39"],"no-negated-in-lhs",["40"],"react/jsx-no-target-blank","Using target=\"_blank\" without rel=\"noreferrer\" is a security risk: see https://html.spec.whatwg.org/multipage/links.html#link-type-noopener","JSXAttribute","import/no-anonymous-default-export","Assign object to a variable before exporting as module default","ExportDefaultDeclaration","no-global-assign","no-unsafe-negation"]
 
1
+ [{"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/index.js":"1","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/reportWebVitals.js":"2","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/App.js":"3","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/Instructions.js":"4","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/InputField.js":"5","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/Section.js":"6"},{"size":500,"mtime":1607266233636,"results":"7","hashOfConfig":"8"},{"size":362,"mtime":1607349763851,"results":"9","hashOfConfig":"8"},{"size":15989,"mtime":1607366079905,"results":"10","hashOfConfig":"8"},{"size":25546,"mtime":1607362560939,"results":"11","hashOfConfig":"8"},{"size":666,"mtime":1607366243409,"results":"12","hashOfConfig":"8"},{"size":645,"mtime":1607365967634,"results":"13","hashOfConfig":"8"},{"filePath":"14","messages":"15","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0,"usedDeprecatedRules":"16"},"d40mw9",{"filePath":"17","messages":"18","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0,"usedDeprecatedRules":"16"},{"filePath":"19","messages":"20","errorCount":0,"warningCount":3,"fixableErrorCount":0,"fixableWarningCount":0,"source":null},{"filePath":"21","messages":"22","errorCount":0,"warningCount":3,"fixableErrorCount":0,"fixableWarningCount":0,"source":"23","usedDeprecatedRules":"16"},{"filePath":"24","messages":"25","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0},{"filePath":"26","messages":"27","errorCount":0,"warningCount":0,"fixableErrorCount":0,"fixableWarningCount":0},"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/index.js",[],["28","29"],"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/reportWebVitals.js",[],"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/App.js",["30","31","32"],"/Users/evrardtserstevens/Document
s/HuggingFace/datasetcard/src/Instructions.js",["33","34","35"],"const NAME = 'Instructions'\n\nexport default {\n name: NAME,\n instructions: {\n homepage: {\n paragraph: [\n \"Add homepage URL here if available (unless it's a GitHub repository)\"\n ],\n example: [\n \"https://facebookresearch.github.io/ELI5/explore.html\"\n ]\n },\n repository: {\n paragraph: [\n \"If the dataset is hosted on github or has a github homepage, add URL here\"\n ],\n example: [\n \"https://github.com/facebookresearch/ELI5\"\n ]\n }, \n paper: {\n paragraph: [\n \"If the dataset was introduced by a paper or there was a paper written describing the dataset, add URL here (landing page for Arxiv paper preferred)\"\n ],\n example: [\n \"https://arxiv.org/abs/1907.09190\"\n ]\n }, \n leaderboard: {\n paragraph: [\n \"If the dataset supports an active leaderboard, add link here\"\n ],\n example: [\n \"\"\n ]\n }, \n contact: {\n paragraph: [\n \"If known, name and email of at least one person the reader can contact for questions about the dataset.\"\n ],\n example: [\n \"yacine@huggingface.co\"\n ]\n }, \n datasetSummary: {\n paragraph: [\n \"Briefly summarize the dataset, its intended use and the supported tasks. Give an overview of how and why the dataset was created. The summary should explicitly mention the languages present in the dataset (possibly in broad terms, e.g. translations between several pairs of European languages), and describe the domain, topic, or genre covered.\"\n ],\n example: [\n \"The ELI5 dataset is an English-language dataset of questions and answers gathered from three subreddits were users ask factual questions requiring paragraph-length or longer answers. 
The dataset was created to support the task of open-domain long form abstractive question answering, and covers questions about general topics in its r/explainlikeimfive subset, science in it r/askscience subset, and History in its r/AskHistorians subset.\"\n ]\n },\n supportedTasks: {\n paragraph: [\n \"For each of the tasks tagged for this dataset, give a brief description of the tag, metrics, and suggested models (with a link to their HuggingFace implementation if available). Give a similar description of tasks that were not covered by the structured tag set (repace the `task-category-tag` with an appropriate `other:other-task-name`).\",\n \"- `task-category-tag`: The dataset can be used to train a model for [TASK NAME], which consists in [TASK DESCRIPTION]. Success on this task is typically measured by achieving a *high/low* [metric name](https://huggingface.co/metrics/metric_name). The ([model name](https://huggingface.co/model_name) or [model class](https://huggingface.co/transformers/model_doc/model_class.html)) model currently achieves the following score. *[IF A LEADERBOARD IS AVAILABLE]:* This task has an active leaderboard which can be found at [leaderboard url]() and ranks models based on [metric name](https://huggingface.co/metrics/metric_name) while also reporting [other metric name](https://huggingface.co/metrics/other_metric_name).\"\n ],\n example: [\n \"abstractive-qa, open-domain-qa: The dataset can be used to train a model for Open Domain Long Form Question Answering. An LFQA model is presented with a non-factoid and asked to retrieve relevant information from a knowledge source (such as Wikipedia), then use it to generate a multi-sentence answer. The model performance is measured by how high its ROUGE score to the reference is. 
A BART-based model with a dense retriever trained to draw information from Wikipedia passages achieves a ROUGE-L of 0.149.\"\n ]\n },\n languages: {\n paragraph: [\n \"Provide a brief overview of the languages represented in the dataset. Describe relevant details about specifics of the language such as whether it is social media text, African American English,...\",\n \"When relevant, please provide [BCP-47 codes](https://tools.ietf.org/html/bcp47), which consist of a [primary language subtag](https://tools.ietf.org/html/bcp47#section-2.2.1), with a [script subtag](https://tools.ietf.org/html/bcp47#section-2.2.3) and/or [region subtag](https://tools.ietf.org/html/bcp47#section-2.2.4) if available.\"\n ],\n example: [\n \"The text in the dataset is in English, as spoken by Reddit users on the r/explainlikeimfive, r/askscience, and r/AskHistorians subreddits. The associated BCP-47 code is en.\"\n ]\n },\n dataInstances: {\n paragraph: [\n \"Provide an JSON-formatted example and brief description of a typical instance in the dataset. If available, provide a link to further examples.\",\n `\n {\n 'example_field': ...,\n ...\n }\n `,\n \"Provide any additional information that is not covered in the other sections about the data here. In particular describe any relationships between data points and if these relationships are made explicit.\",\n ],\n example: [\n \"A typical data point comprises a question, with a title containing the main question and a selftext which sometimes elaborates on it, and a list of answers from the forum sorted by the number of upvotes they obtained. 
Additionally, the URLs in each of the text fields have been extracted to respective lists and replaced by generic tokens in the text.\",\n \"An example from the ELI5 test set looks as follows:\",\n `{'q_id': '8houtx',`,\n `title': 'Why does water heated to room temperature feel colder than the air around it?,`,\n `selftext': '',`,\n `document': '',`,\n `subreddit': 'explainlikeimfive',`,\n `answers': {'a_id': ['dylcnfk', 'dylcj49'],`,\n `text': [\"Water transfers heat more efficiently than air. When something feels cold it's because heat is being transferred from your skin to whatever you're touching. Since water absorbs the heat more readily than air, it feels colder.\",\n \"Air isn't as good at transferring heat compared to something like water or steel (sit on a room temperature steel bench vs. a room temperature wooden bench, and the steel one will feel more cold).\\n\\nWhen you feel cold, what you're feeling is heat being transferred out of you. If there is no breeze, you feel a certain way. If there's a breeze, you will get colder faster (because the moving air is pulling the heat away from you), and if you get into water, its quite good at pulling heat from you. Get out of the water and have a breeze blow on you while you're wet, all of the water starts evaporating, pulling even more heat from you.\"],`,\n `score': [5, 2]},`,\n `title_urls': {'url': []},`,\n `selftext_urls': {'url': []},`,\n `answers_urls': {'url': []}}{'q_id': '8houtx',`,\n `title': 'Why does water heated to room temperature feel colder than the air around it?',`,\n `selftext': '',`,\n `document': '',`,\n `subreddit': 'explainlikeimfive',`,\n `answers': {'a_id': ['dylcnfk', 'dylcj49'],`,\n `text': [\"Water transfers heat more efficiently than air. When something feels cold it's because heat is being transferred from your skin to whatever you're touching. 
Since water absorbs the heat more readily than air, it feels colder.\",\n \"Air isn't as good at transferring heat compared to something like water or steel (sit on a room temperature steel bench vs. a room temperature wooden bench, and the steel one will feel more cold).\\n\\nWhen you feel cold, what you're feeling is heat being transferred out of you. If there is no breeze, you feel a certain way. If there's a breeze, you will get colder faster (because the moving air is pulling the heat away from you), and if you get into water, its quite good at pulling heat from you. Get out of the water and have a breeze blow on you while you're wet, all of the water starts evaporating, pulling even more heat from you.\"]`,\n `score': [5, 2]},`,\n `title_urls': {'url': []},`,\n `selftext_urls': {'url': []},`,\n `answers_urls': {'url': []}},`\n ]\n },\n dataFields: {\n paragraph: [\n \"List and describe the fields present in the dataset. Mention their data type, and whether they are used as input or output in any of the tasks the dataset currently supports. If the data has span indices, describe their attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc. 
If the datasets contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.\",\n \"- `example_field`: description of `example_field`\"\n ], \n example: [\n `q_id: a string question identifier for each example, corresponding to its ID in the Pushshift.io Reddit submission dumps.`,\n `subreddit: One of explainlikeimfive, askscience, or AskHistorians, indicating which subreddit the question came from`,\n `title: title of the question, with URLs extracted and replaced by URL_n tokens`,\n `title_urls: list of the extracted URLs, the nth element of the list was replaced by URL_n`,\n `selftext: either an empty string or an elaboration of the question`,\n `selftext_urls: similar to title_urls but for self_text`,\n `answers: a list of answers, each answer has:`,\n `a_id: a string answer identifier for each answer, corresponding to its ID in the Pushshift.io Reddit comments dumps.`,\n `text: the answer text with the URLs normalized`,\n `score: the number of upvotes the answer had received when the dumps were created`,\n `answers_urls: a list of the extracted URLs. All answers use the same list, the numbering of the normalization token continues across answer texts`,\n ]\n },\n dataSplits: {\n paragraph: [\n \"Describe and name the splits in the dataset if there are more than one.\",\n \"Describe any criteria for splitting the data, if used. If their are differences between the splits (e.g. if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.\",\n \"Provide the sizes of each split. As appropriate, provide any descriptive statistics for the features, such as average length. 
For example:\",\n `\tTain\tValid\tTest\n Input Sentences \t\n Average Sentence Length`,\n ],\n example: [\n \"The data is split into a training, validation and test set for each of the three subreddits. In order to avoid having duplicate questions in across sets, the title field of each of the questions were ranked by their tf-idf match to their nearest neighbor and the ones with the smallest value were used in the test and validation sets. The final split sizes are as follow:\",\n `\tTain\tValid\tTest\n r/explainlikeimfive examples\t272634\t9812\t24512\n r/askscience examples\t131778\t2281\t4462\n r/AskHistorians examples\t98525\t4901\t9764`\n ]\n },\n curationRationale: {\n paragraph: [\n \"What need motivated the creation of this dataset? What are some of the reasons underlying the major choices involved in putting it together?\",\n ],\n example: [\n \"ELI5 was built to provide a testbed for machines to learn how to answer more complex questions, which requires them to find and combine information in a coherent manner. The dataset was built by gathering questions that were asked by community members of three subreddits, including r/explainlikeimfive, along with the answers that were provided by other users. The rules of the subreddit make this data particularly well suited to training a model for abstractive question answering: the questions need to seek an objective explanation about well established facts, and the answers provided need to be understandable to a layperson without any particular knowledge domain.\"\n ]\n },\n dataCollection: {\n paragraph: [\n \"Describe the data collection process. Describe any criteria for data selection or filtering. List any key words or search terms used. 
If possible, include runtime information for the collection process.\",\n \"If data was collected from other pre-existing datasets, link to source here and to their [Hugging Face version](https://huggingface.co/datasets/dataset_name).\",\n \"If the data was modified or normalized after being collected (e.g. if the data is word-tokenized), describe the process and the tools used.\"\n ],\n example:[\n \"The data was obtained by filtering submissions and comments from the subreddits of interest from the XML dumps of the Reddit forum hosted on Pushshift.io.\",\n \"In order to further improve the quality of the selected examples, only questions with a score of at least 2 and at least one answer with a score of at least 2 were selected for the dataset. The dataset questions and answers span a period form August 2012 to August 2019.\"\n ]\n },\n sourceLanguage: {\n paragraph: [\n \"State whether the data was produced by humans or machine generated. Describe the people or systems who originally created the data.\",\n \"If available, include self-reported demographic or identity information for the source data creators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as a variables, particularly gender.\",\n \"Describe the conditions under which the data was created (for example, if the producers were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.\",\n \"Describe other people represented or mentioned in the data. Where possible, link to references for the information.\"\n ],\n example: [\n \"The language producers are users of the r/explainlikeimfive, r/askscience, and r/AskHistorians subreddits between 2012 and 2019. 
No further demographic information was available from the data source.\"\n ]\n },\n annotations: {\n paragraph: [\n \"If the dataset contains annotations which are not part of the initial data collection, describe them in the following paragraphs.\"\n ],\n example: [\n \"The dataset does not contain any additional annotations.\"\n ]\n },\n annotationProcess: {\n paragraph: [\n \"If applicable, describe the annotation process and any tools used, or state otherwise. Describe the amount of data annotated, if not all. Describe or reference annotation guidelines provided to the annotators. If available, provide interannotator statistics. Describe any annotation validation processes.\"\n ],\n example: [\n \"[N/A]\"\n ]\n },\n annotators: {\n paragraph: [\n \"If annotations were collected for the source data (such as class labels or syntactic parses), state whether the annotations were produced by humans or machine generated.\",\n \"Describe the people or systems who originally created the annotations and their selection criteria if applicable.\",\n \"If available, include self-reported demographic or identity information for the annotators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as a variables, particularly gender.\",\n \"Describe the conditions under which the data was annotated (for example, if the annotators were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.\"\n ],\n example: [\n \"[N/A]\"\n ]\n },\n personalInformation: {\n paragraph: [\n \"State whether the dataset uses identity categories and, if so, how the information is used. Describe where this information comes from (i.e. self-reporting, collecting from profiles, inferring, etc.). 
See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as a variables, particularly gender. State whether the data is linked to individuals and whether those individuals can be identified in the dataset, either directly or indirectly (i.e., in combination with other data).\",\n \"State whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history).\",\n \"If efforts were made to anonymize the data, describe the anonymization process.\"\n ],\n example: [\n \"The authors removed the speaker IDs from the Pushshift.io dumps but did not otherwise anonymize the data. Some of the questions and answers are about contemporary public figures or individuals who appeared in the news.\"\n ]\n },\n socialImpact: {\n paragraph: [\n \"Please discuss some of the ways you believe the use of this dataset will impact society.\",\n \"The statement should include both positive outlooks, such as outlining how technologies developed through its use may improve people's lives, and discuss the accompanying risks. 
These risks may range from making important decisions more opaque to people who are affected by the technology, to reinforcing existing harmful biases (whose specifics should be discussed in the next section), among other considerations.\",\n \"Please also mention in this section if the proposed dataset contains a *low-resource* or under-represented language.\"\n ],\n example: [\n \"The purpose of this dataset is to help develop better question answering systems.\",\n \"A system that succeeds at the supported task would be able to provide a coherent answer to even complex questions requiring a multi-step explanation, which is beyond the ability of even the larger existing models. The task is also thought as a test-bed for retrieval model which can show the users which source text was used in generating the answer and allow them to confirm the information provided to them.\",\n \"It should be noted however that the provided answers were written by Reddit users, an information which may be lost if models trained on it are deployed in down-stream applications and presented to users without context. The specific biases this may introduce are discussed in the next section.\",\n ]\n },\n biasesDiscussion: {\n paragraph: [\n \"Provide descriptions of specific biases that are likely to be reflected in the data, and state whether any steps were taken to reduce their impact.\",\n \"For Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://arxiv.org/abs/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic.\",\n \"If analyses have been run quantifying these biases, please add brief summaries and links to the studies here.\"\n ],\n example: [\n \"While Reddit hosts a number of thriving communities with high quality discussions, it is also widely known to have corners where sexism, hate, and harassment are significant issues. 
See for example the recent post from Reddit founder u/spez outlining some of the ways he thinks the website's historical policies have been responsible for this problem, Adrienne Massanari's 2015 article on GamerGate and follow-up works, or a 2019 Wired article on misogyny on Reddit.\",\n \"While there has been some recent work in the NLP community on de-biasing models (e.g. Black is to Criminal as Caucasian is to Police: Detecting and Removing Multiclass Bias in Word Embeddings for word embeddings trained specifically on Reddit data), this problem is far from solved, and the likelihood that a trained model might learn the biases present in the data remains a significant concern.\",\n `We still note some encouraging signs for all of these communities: r/explainlikeimfive and r/askscience have similar structures and purposes, and r/askscience was found in 2015 to show medium supportiveness and very low toxicity when compared to other subreddits (see a hackerfall post, thecut.com write-up and supporting data). Meanwhile, the r/AskHistorians rules mention that the admins will not tolerate \"racism, sexism, or any other forms of bigotry\". However, further analysis of whether and to what extent these rules reduce toxicity is still needed.`,\n \"We also note that given the audience of the Reddit website which is more broadly used in the US and Europe, the answers will likely present a Western perspectives, which is particularly important to note when dealing with historical topics.\"\n ]\n },\n limitations: {\n paragraph: [\n \"If studies of the datasets have outlined other limitations of the dataset, such as annotation artifacts, please outline and cite them here.\"\n ],\n example: [\n \"The answers provided in the dataset are represent the opinion of Reddit users. 
While these communities strive to be helpful, they should not be considered to represent a ground truth.\"\n ]\n },\n datasetCurators: {\n paragraph: [\n \"List the people involved in collecting the dataset and their affiliation(s). If funding information is known, include it here.\"\n ],\n example: [\n \"The dataset was initially created by Angela Fan, Ethan Perez, Yacine Jernite, Jason Weston, Michael Auli, and David Grangier, during work done at Facebook AI Research (FAIR).\"\n ]\n },\n licensingInformation: {\n paragraph: [\n \"Provide the license and link to the license webpage if available.\"\n ],\n example: [\n \"The licensing status of the dataset hinges on the legal status of the Pushshift.io data which is unclear.\"\n ]\n },\n citationInformation: {\n paragraph: [\n \"Provide the [BibTex](http://www.bibtex.org/)-formatted reference for the dataset. For example:\",\n `\n @article{article_id,\n author = {Author List},\n title = {Dataset Paper Title},\n journal = {Publication Venue},\n year = {2525}\n }\n `,\n \"If the dataset has a [DOI](https://www.doi.org/), please provide it here.\"\n ],\n example: [\n `@inproceedings{eli5_lfqa`,\n `author = {Angela Fan and\n Yacine Jernite and\n Ethan Perez and\n David Grangier and\n Jason Weston and\n Michael Auli}`,\n `editor = {Anna Korhonen and\n David R. 
Traum and\n Llu{\\'{\\i}}s M{\\`{a}}rquez}`,\n `title = {{ELI5:} Long Form Question Answering}`,\n `booktitle = {Proceedings of the 57th Conference of the Association for Computational\n Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,\n Volume 1: Long Papers}`,\n `pages = {3558--3567}`,\n `publisher = {Association for Computational Linguistics}`,\n `year = {2019}`,\n `url = {https://doi.org/10.18653/v1/p19-1346}`,\n `doi = {10.18653/v1/p19-1346}\n }`\n ]\n },\n }\n}","/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/InputField.js",[],"/Users/evrardtserstevens/Documents/HuggingFace/datasetcard/src/Section.js",[],{"ruleId":"36","replacedBy":"37"},{"ruleId":"38","replacedBy":"39"},{"ruleId":"40","severity":1,"message":"41","line":338,"column":71,"nodeType":"42","endLine":338,"endColumn":86},{"ruleId":"40","severity":1,"message":"41","line":350,"column":71,"nodeType":"42","endLine":350,"endColumn":86},{"ruleId":"40","severity":1,"message":"41","line":359,"column":103,"nodeType":"42","endLine":359,"endColumn":118},{"ruleId":"43","severity":1,"message":"44","line":3,"column":1,"nodeType":"45","endLine":300,"endColumn":2},{"ruleId":"46","severity":1,"message":"47","line":286,"column":34,"nodeType":"48","messageId":"49","endLine":286,"endColumn":35,"suggestions":"50"},{"ruleId":"46","severity":1,"message":"51","line":286,"column":37,"nodeType":"48","messageId":"49","endLine":286,"endColumn":38,"suggestions":"52"},"no-native-reassign",["53"],"no-negated-in-lhs",["54"],"react/jsx-no-target-blank","Using target=\"_blank\" without rel=\"noreferrer\" is a security risk: see https://html.spec.whatwg.org/multipage/links.html#link-type-noopener","JSXAttribute","import/no-anonymous-default-export","Assign object to a variable before exporting as module default","ExportDefaultDeclaration","no-useless-escape","Unnecessary escape character: \\'.","TemplateElement","unnecessaryEscape",["55","56"],"Unnecessary escape character: 
\\i.",["57","58"],"no-global-assign","no-unsafe-negation",{"messageId":"59","fix":"60","desc":"61"},{"messageId":"62","fix":"63","desc":"64"},{"messageId":"59","fix":"65","desc":"61"},{"messageId":"62","fix":"66","desc":"64"},"removeEscape",{"range":"67","text":"68"},"Remove the `\\`. This maintains the current functionality.","escapeBackslash",{"range":"69","text":"70"},"Replace the `\\` with `\\\\` to include the actual backslash character.",{"range":"71","text":"68"},{"range":"72","text":"70"},[24868,24869],"",[24868,24868],"\\",[24871,24872],[24871,24871]]
package-lock.json CHANGED
@@ -14815,6 +14815,11 @@
14815
  "resolved": "https://registry.npmjs.org/react-error-overlay/-/react-error-overlay-6.0.8.tgz",
14816
  "integrity": "sha512-HvPuUQnLp5H7TouGq3kzBeioJmXms1wHy9EGjz2OURWBp4qZO6AfGEcnxts1D/CbwPLRAgTMPCEgYhA3sEM4vw=="
14817
  },
 
 
 
 
 
14818
  "react-is": {
14819
  "version": "16.13.1",
14820
  "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
 
14815
  "resolved": "https://registry.npmjs.org/react-error-overlay/-/react-error-overlay-6.0.8.tgz",
14816
  "integrity": "sha512-HvPuUQnLp5H7TouGq3kzBeioJmXms1wHy9EGjz2OURWBp4qZO6AfGEcnxts1D/CbwPLRAgTMPCEgYhA3sEM4vw=="
14817
  },
14818
+ "react-icons": {
14819
+ "version": "4.1.0",
14820
+ "resolved": "https://registry.npmjs.org/react-icons/-/react-icons-4.1.0.tgz",
14821
+ "integrity": "sha512-FCXBg1JbbR0vWALXIxmFAfozHdVIJmmwCD81Jk0EKOt7Ax4AdBNcaRkWhR0NaKy9ugJgoY3fFvo0PHpte55pXg=="
14822
+ },
14823
  "react-is": {
14824
  "version": "16.13.1",
14825
  "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
package.json CHANGED
@@ -2,6 +2,7 @@
2
  "name": "datasetcard",
3
  "version": "0.1.0",
4
  "private": true,
 
5
  "dependencies": {
6
  "@craco/craco": "^5.9.0",
7
  "@testing-library/jest-dom": "^5.11.6",
@@ -12,6 +13,7 @@
12
  "postcss": "^7.0.35",
13
  "react": "^17.0.1",
14
  "react-dom": "^17.0.1",
 
15
  "react-markdown": "^5.0.3",
16
  "react-scripts": "4.0.1",
17
  "save-file": "^2.3.1",
 
2
  "name": "datasetcard",
3
  "version": "0.1.0",
4
  "private": true,
5
+ "homepage": ".",
6
  "dependencies": {
7
  "@craco/craco": "^5.9.0",
8
  "@testing-library/jest-dom": "^5.11.6",
 
13
  "postcss": "^7.0.35",
14
  "react": "^17.0.1",
15
  "react-dom": "^17.0.1",
16
+ "react-icons": "^4.1.0",
17
  "react-markdown": "^5.0.3",
18
  "react-scripts": "4.0.1",
19
  "save-file": "^2.3.1",
src/App.js CHANGED
@@ -1,13 +1,22 @@
1
  import React, { useState } from 'react';
2
  import InputField from "./InputField"
 
3
  import Instructions from './Instructions'
4
  import ReactMarkdown from "react-markdown";
5
  import {save} from 'save-file'
6
 
 
7
  function App() {
8
 
9
  const [fieldFocussed, setFieldFocussed] = useState()
10
  const [card, setCard] = useState({})
 
 
 
 
 
 
 
11
 
12
  async function handleClick(e){
13
  setFieldFocussed(e.target.id)
@@ -17,9 +26,81 @@ function App() {
17
  setCard({...card, [e.target.id]:e.currentTarget.value})
18
  }
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  async function exportFile(card){
21
  var textTest = `
22
- # Dataset Card for [Dataset Name]
 
 
23
 
24
  ## Table of Contents
25
  - [Dataset Description](#dataset-description)
@@ -147,7 +228,7 @@ function App() {
147
  <div className="max-h-screen flex overflow-hidden bg-white">
148
  <div className="max-w-7xl mb-32 mx-auto py-16 px-4 sm:py-24 sm:px-6 lg:px-8">
149
  <div className="text-center">
150
- <p className="mt-1 text-4xl font-extrabold text-gray-700 sm:tracking-tight">New Dataset Card</p>
151
  <p className="max-w-xl mt-5 mx-auto text-lg text-gray-500">Fill in the form below</p>
152
  </div>
153
  <div className="flex justify-end">
@@ -159,29 +240,78 @@ function App() {
159
  <div className="max-w-7xl px-4 divide-y-2 divide-gray-200 sm:px-6 lg:px-8">
160
  <div className="">
161
  <dl className="space-y-8 divide-y p-6 divide-gray-200">
 
 
 
 
 
 
 
 
 
 
 
162
  <InputField title={"Homepage"} id={"homepage"} rows={1} handleClick={handleClick} handleChange={handleChange} />
163
  <InputField title={"Repository"} id={"repository"} rows={1} handleClick={handleClick} handleChange={handleChange} />
164
  <InputField title={"Paper"} id={"paper"} rows={1} handleClick={handleClick} handleChange={handleChange} />
165
  <InputField title={"Leaderboard"} id={"leaderboard"} rows={1} handleClick={handleClick} handleChange={handleChange} />
166
  <InputField title={"Point of Contact"} id={"contact"} rows={1} handleClick={handleClick} handleChange={handleChange} />
 
 
 
 
 
 
 
167
  <InputField title={"Dataset Summary"} id={"datasetSummary"} rows={4} handleClick={handleClick} handleChange={handleChange} />
168
  <InputField title={"Supported Tasks and Leaderboards"} id={"supportedTasks"} rows={4} handleClick={handleClick} handleChange={handleChange} />
169
  <InputField title={"Languages"} id={"languages"} rows={4} handleClick={handleClick} handleChange={handleChange} />
 
 
 
 
 
 
 
170
  <InputField title={"Data Instances"} id={"dataInstances"} rows={4} handleClick={handleClick} handleChange={handleChange} />
171
  <InputField title={"Data Fields"} id={"dataFields"} rows={4} handleClick={handleClick} handleChange={handleChange} />
172
  <InputField title={"Data Splits"} id={"dataSplits"} rows={4} handleClick={handleClick} handleChange={handleChange} />
 
 
 
 
 
 
 
173
  <InputField title={"Curation Rationale"} id={"curationRationale"} rows={4} handleClick={handleClick} handleChange={handleChange} />
174
  <InputField title={"Initial Data Collection and Normalization"} id={"dataCollection"} rows={4} handleClick={handleClick} handleChange={handleChange} />
175
  <InputField title={"Who are the source language producers?"} id={"sourceLanguage"} rows={4} handleClick={handleClick} handleChange={handleChange} />
176
  <InputField title={"Annotation Process"} id={"annotationProcess"} rows={4} handleClick={handleClick} handleChange={handleChange} />
177
  <InputField title={"Who are the annotators?"} id={"annotators"} rows={4} handleClick={handleClick} handleChange={handleChange} />
178
  <InputField title={"Personal and Sensitive Information"} id={"personalInformation"} rows={4} handleClick={handleClick} handleChange={handleChange} />
 
 
 
 
 
 
 
179
  <InputField title={"Social Impact of Dataset"} id={"socialImpact"} rows={4} handleClick={handleClick} handleChange={handleChange} />
180
  <InputField title={"Discussion of Biases"} id={"biasesDiscussion"} rows={4} handleClick={handleClick} handleChange={handleChange} />
181
  <InputField title={"Other Known Limitations"} id={"limitations"} rows={4} handleClick={handleClick} handleChange={handleChange} />
 
 
 
 
 
 
 
182
  <InputField title={"Dataset Curators"} id={"datasetCurators"} rows={4} handleClick={handleClick} handleChange={handleChange} />
183
  <InputField title={"Licensing Information"} id={"licensingInformation"} rows={4} handleClick={handleClick} handleChange={handleChange} />
184
  <InputField title={"Citation Information"} id={"citationInformation"} rows={4} handleClick={handleClick} handleChange={handleChange} />
 
 
 
185
  </dl>
186
  </div>
187
  </div>
@@ -191,16 +321,17 @@ function App() {
191
  </div>
192
  <div className="col-span-4">
193
  <div className="h-screen flex overflow-hidden bg-gray-100">
194
- <div className="max-w-7xl mx-auto py-16 px-4 sm:py-24 sm:px-6 lg:px-8">
195
  <div className="text-center">
196
- <p className="mt-1 text-4xl font-extrabold text-gray-700 sm:tracking-tight">Instructions</p>
197
  {!fieldFocussed &&
198
- <p className="max-w-xl mt-5 mx-auto text-lg text-gray-500">Click on a field to see instructions</p>
199
  }
200
  </div>
201
- <div className="max-w-7xl mx-auto py-12 px-4 divide-y-2 divide-gray-200 sm:px-6 lg:py-16 lg:px-8">
202
  <div className="mt-6">
203
  <dl className="space-y-8 divide-gray-200 text-gray-600 text-left">
 
204
  {Instructions.instructions[fieldFocussed] && Instructions.instructions[fieldFocussed].paragraph.map((para) => (
205
  <div key={para}>
206
  <ReactMarkdown source={para}
@@ -210,7 +341,18 @@ function App() {
210
  ))}
211
  </dl>
212
  </div>
213
-
 
 
 
 
 
 
 
 
 
 
 
214
  </div>
215
  <div className="absolute bottom-0 text-xs left-0 ml-4 text-gray-500">
216
  developed by
@@ -222,6 +364,12 @@ function App() {
222
  </div>
223
  </div>
224
  </div>
 
 
 
 
 
 
225
  </div>
226
  );
227
  }
 
1
  import React, { useState } from 'react';
2
  import InputField from "./InputField"
3
+ import Section from "./Section"
4
  import Instructions from './Instructions'
5
  import ReactMarkdown from "react-markdown";
6
  import {save} from 'save-file'
7
 
8
+
9
  function App() {
10
 
11
  const [fieldFocussed, setFieldFocussed] = useState()
12
  const [card, setCard] = useState({})
13
+ const [tagsSection, setTagsSection] = useState(false)
14
+ const [urlsSection, setUrlsSection] = useState(false)
15
+ const [datasetDescriptionSection, setDatasetDescriptionSection] = useState(false)
16
+ const [datasetStructureSection, setDatasetStructureSection] = useState(false)
17
+ const [datasetCreationSection, setDatasetCreationSection] = useState(false)
18
+ const [considerationsSection, setConsiderationsSection] = useState(false)
19
+ const [additionalInformationSection, setAdditionalInformationSection] = useState(false)
20
 
21
  async function handleClick(e){
22
  setFieldFocussed(e.target.id)
 
26
  setCard({...card, [e.target.id]:e.currentTarget.value})
27
  }
28
 
29
+ async function handleTagsSection(){
30
+ setAdditionalInformationSection(false)
31
+ setConsiderationsSection(false)
32
+ setDatasetCreationSection(false)
33
+ setDatasetStructureSection(false)
34
+ setDatasetDescriptionSection(false)
35
+ setUrlsSection(false)
36
+ setTagsSection(!tagsSection)
37
+ }
38
+
39
+ async function handleUrlsSection(){
40
+ setAdditionalInformationSection(false)
41
+ setConsiderationsSection(false)
42
+ setDatasetCreationSection(false)
43
+ setDatasetStructureSection(false)
44
+ setDatasetDescriptionSection(false)
45
+ setTagsSection(false)
46
+ setUrlsSection(!urlsSection)
47
+ }
48
+
49
+ async function handleDatasetDescriptionSection(){
50
+ setAdditionalInformationSection(false)
51
+ setConsiderationsSection(false)
52
+ setDatasetCreationSection(false)
53
+ setDatasetStructureSection(false)
54
+ setTagsSection(false)
55
+ setUrlsSection(false)
56
+ setDatasetDescriptionSection(!datasetDescriptionSection)
57
+ }
58
+
59
+ async function handleDatasetStructureSection(){
60
+ setAdditionalInformationSection(false)
61
+ setConsiderationsSection(false)
62
+ setDatasetCreationSection(false)
63
+ setDatasetDescriptionSection(false)
64
+ setTagsSection(false)
65
+ setUrlsSection(false)
66
+ setDatasetStructureSection(!datasetStructureSection)
67
+ }
68
+
69
+ async function handleDatasetCreationSection(){
70
+ setAdditionalInformationSection(false)
71
+ setConsiderationsSection(false)
72
+ setDatasetDescriptionSection(false)
73
+ setTagsSection(false)
74
+ setUrlsSection(false)
75
+ setDatasetStructureSection(false)
76
+ setDatasetCreationSection(!datasetCreationSection)
77
+ }
78
+
79
+ async function handleConsiderationsSection(){
80
+ setAdditionalInformationSection(false)
81
+ setDatasetDescriptionSection(false)
82
+ setTagsSection(false)
83
+ setUrlsSection(false)
84
+ setDatasetStructureSection(false)
85
+ setDatasetCreationSection(false)
86
+ setConsiderationsSection(!considerationsSection)
87
+ }
88
+
89
+ async function handleAdditionalInformationSection(){
90
+ setDatasetDescriptionSection(false)
91
+ setTagsSection(false)
92
+ setUrlsSection(false)
93
+ setDatasetStructureSection(false)
94
+ setDatasetCreationSection(false)
95
+ setConsiderationsSection(false)
96
+ setAdditionalInformationSection(!additionalInformationSection)
97
+ }
98
+
99
  async function exportFile(card){
100
  var textTest = `
101
+ ${card.yamlTags}
102
+
103
+ # Dataset Card for ${card.datasetName}
104
 
105
  ## Table of Contents
106
  - [Dataset Description](#dataset-description)
 
228
  <div className="max-h-screen flex overflow-hidden bg-white">
229
  <div className="max-w-7xl mb-32 mx-auto py-16 px-4 sm:py-24 sm:px-6 lg:px-8">
230
  <div className="text-center">
231
+ <p className="mt-1 text-4xl font-extrabold text-gray-700 sm:tracking-tight">New Dataset Card for <input onChange={(e) => handleChange(e)} id="datasetName" placeholder="dataset name" maxLength="200" className="ml-4 py-4 text-4xl text-gray-600 w-80 border border-solid border-gray-200 border-none h-10 rounded-md shadow" /></p>
232
  <p className="max-w-xl mt-5 mx-auto text-lg text-gray-500">Fill in the form below</p>
233
  </div>
234
  <div className="flex justify-end">
 
240
  <div className="max-w-7xl px-4 divide-y-2 divide-gray-200 sm:px-6 lg:px-8">
241
  <div className="">
242
  <dl className="space-y-8 divide-y p-6 divide-gray-200">
243
+
244
+ <Section title={"YAML Tags"} section={tagsSection} handleSection={handleTagsSection} />
245
+
246
+ {tagsSection &&
247
+ <InputField title={"YAML tags"} id={"yamlTags"} rows={4} handleClick={handleClick} handleChange={handleChange} />
248
+ }
249
+
250
+ <Section title={"Urls"} section={urlsSection} handleSection={handleUrlsSection} />
251
+
252
+ {urlsSection &&
253
+ <>
254
  <InputField title={"Homepage"} id={"homepage"} rows={1} handleClick={handleClick} handleChange={handleChange} />
255
  <InputField title={"Repository"} id={"repository"} rows={1} handleClick={handleClick} handleChange={handleChange} />
256
  <InputField title={"Paper"} id={"paper"} rows={1} handleClick={handleClick} handleChange={handleChange} />
257
  <InputField title={"Leaderboard"} id={"leaderboard"} rows={1} handleClick={handleClick} handleChange={handleChange} />
258
  <InputField title={"Point of Contact"} id={"contact"} rows={1} handleClick={handleClick} handleChange={handleChange} />
259
+ </>
260
+ }
261
+
262
+ <Section title={"Dataset Description"} section={datasetDescriptionSection} handleSection={handleDatasetDescriptionSection} />
263
+
264
+ {datasetDescriptionSection &&
265
+ <>
266
  <InputField title={"Dataset Summary"} id={"datasetSummary"} rows={4} handleClick={handleClick} handleChange={handleChange} />
267
  <InputField title={"Supported Tasks and Leaderboards"} id={"supportedTasks"} rows={4} handleClick={handleClick} handleChange={handleChange} />
268
  <InputField title={"Languages"} id={"languages"} rows={4} handleClick={handleClick} handleChange={handleChange} />
269
+ </>
270
+ }
271
+
272
+ <Section title={"Dataset Structure"} section={datasetStructureSection} handleSection={handleDatasetStructureSection} />
273
+
274
+ {datasetStructureSection &&
275
+ <>
276
  <InputField title={"Data Instances"} id={"dataInstances"} rows={4} handleClick={handleClick} handleChange={handleChange} />
277
  <InputField title={"Data Fields"} id={"dataFields"} rows={4} handleClick={handleClick} handleChange={handleChange} />
278
  <InputField title={"Data Splits"} id={"dataSplits"} rows={4} handleClick={handleClick} handleChange={handleChange} />
279
+ </>
280
+ }
281
+
282
+ <Section title={"Dataset Creation"} section={datasetCreationSection} handleSection={handleDatasetCreationSection} />
283
+
284
+ {datasetCreationSection &&
285
+ <>
286
  <InputField title={"Curation Rationale"} id={"curationRationale"} rows={4} handleClick={handleClick} handleChange={handleChange} />
287
  <InputField title={"Initial Data Collection and Normalization"} id={"dataCollection"} rows={4} handleClick={handleClick} handleChange={handleChange} />
288
  <InputField title={"Who are the source language producers?"} id={"sourceLanguage"} rows={4} handleClick={handleClick} handleChange={handleChange} />
289
  <InputField title={"Annotation Process"} id={"annotationProcess"} rows={4} handleClick={handleClick} handleChange={handleChange} />
290
  <InputField title={"Who are the annotators?"} id={"annotators"} rows={4} handleClick={handleClick} handleChange={handleChange} />
291
  <InputField title={"Personal and Sensitive Information"} id={"personalInformation"} rows={4} handleClick={handleClick} handleChange={handleChange} />
292
+ </>
293
+ }
294
+
295
+ <Section title={"Considerations for Using the Data"} section={considerationsSection} handleSection={handleConsiderationsSection} />
296
+
297
+ {considerationsSection &&
298
+ <>
299
  <InputField title={"Social Impact of Dataset"} id={"socialImpact"} rows={4} handleClick={handleClick} handleChange={handleChange} />
300
  <InputField title={"Discussion of Biases"} id={"biasesDiscussion"} rows={4} handleClick={handleClick} handleChange={handleChange} />
301
  <InputField title={"Other Known Limitations"} id={"limitations"} rows={4} handleClick={handleClick} handleChange={handleChange} />
302
+ </>
303
+ }
304
+
305
+ <Section title={"Additional Information"} section={additionalInformationSection} handleSection={handleAdditionalInformationSection} />
306
+
307
+ {additionalInformationSection &&
308
+ <>
309
  <InputField title={"Dataset Curators"} id={"datasetCurators"} rows={4} handleClick={handleClick} handleChange={handleChange} />
310
  <InputField title={"Licensing Information"} id={"licensingInformation"} rows={4} handleClick={handleClick} handleChange={handleChange} />
311
  <InputField title={"Citation Information"} id={"citationInformation"} rows={4} handleClick={handleClick} handleChange={handleChange} />
312
+ </>
313
+ }
314
+
315
  </dl>
316
  </div>
317
  </div>
 
321
  </div>
322
  <div className="col-span-4">
323
  <div className="h-screen flex overflow-hidden bg-gray-100">
324
+ <div className="max-w-7xl mx-auto w-full overflow-y-auto py-16 px-4 sm:py-24">
325
  <div className="text-center">
326
+ <p className="mt-1 text-4xl font-extrabold text-gray-700 sm:tracking-tight">Information</p>
327
  {!fieldFocussed &&
328
+ <p className="max-w-xl mt-5 mx-auto text-lg text-gray-500">Click on a field to see instructions/example</p>
329
  }
330
  </div>
331
+ <div className="max-w-7xl text-left mx-auto py-12 px-4 divide-y-2 divide-gray-200 sm:px-6 lg:py-16 lg:px-8">
332
  <div className="mt-6">
333
  <dl className="space-y-8 divide-gray-200 text-gray-600 text-left">
334
+ <p className="mt-1 text-xl font-extrabold text-gray-700 sm:tracking-tight">Instructions</p>
335
  {Instructions.instructions[fieldFocussed] && Instructions.instructions[fieldFocussed].paragraph.map((para) => (
336
  <div key={para}>
337
  <ReactMarkdown source={para}
 
341
  ))}
342
  </dl>
343
  </div>
344
+ <div className="mt-12">
345
+ <dl className="space-y-8 divide-gray-200 text-gray-600 text-left">
346
+ <p className="mt-1 text-xl font-extrabold text-gray-700 sm:tracking-tight">Example</p>
347
+ {Instructions.instructions[fieldFocussed] && Instructions.instructions[fieldFocussed].example.map((ex) => (
348
+ <div key={ex}>
349
+ <ReactMarkdown source={ex}
350
+ renderers={{link: props => <a href={props.href} target="_blank">{props.children}</a>}}
351
+ />
352
+ </div>
353
+ ))}
354
+ </dl>
355
+ </div>
356
  </div>
357
  <div className="absolute bottom-0 text-xs left-0 ml-4 text-gray-500">
358
  developed by
 
364
  </div>
365
  </div>
366
  </div>
367
+ <style>{`
368
+ .borders {
369
+ border-bottom: solid 1px;
370
+ border-color: #e2e8f0;
371
+ }
372
+ `}</style>
373
  </div>
374
  );
375
  }
src/InputField.js CHANGED
@@ -3,13 +3,15 @@ import React from 'react';
3
  export default function InputField({ title, id, rows, handleClick, handleChange }) {
4
 
5
  return (
 
6
  <div className="pt-6 md:grid md:grid-cols-12 md:gap-8">
7
- <dt className="text-base font-medium text-gray-700 md:col-span-3">
8
  {title}
9
  </dt>
10
- <dd className="mt-2 md:mt-0 md:col-span-9">
11
  <textarea onClick={(e) => handleClick(e)} onChange={(e) => handleChange(e)} id={id} name={id} rows={rows} className="font-sans p-2 shadow-sm border border-solid border-gray-300 block w-full text-gray-600 sm:text-sm rounded-md"></textarea>
12
  </dd>
13
  </div>
 
14
  );
15
  }
 
3
  export default function InputField({ title, id, rows, handleClick, handleChange }) {
4
 
5
  return (
6
+ <div className="w-full">
7
  <div className="pt-6 md:grid md:grid-cols-12 md:gap-8">
8
+ <dt className="text-base font-medium max-w-40 text-gray-700 md:col-span-4">
9
  {title}
10
  </dt>
11
+ <dd className="mt-2 md:mt-0 md:col-span-8 mr-6">
12
  <textarea onClick={(e) => handleClick(e)} onChange={(e) => handleChange(e)} id={id} name={id} rows={rows} className="font-sans p-2 shadow-sm border border-solid border-gray-300 block w-full text-gray-600 sm:text-sm rounded-md"></textarea>
13
  </dd>
14
  </div>
15
+ </div>
16
  );
17
  }
src/Instructions.js CHANGED
@@ -7,42 +7,66 @@ export default {
7
  paragraph: [
8
  "Add homepage URL here if available (unless it's a GitHub repository)"
9
  ],
 
 
 
10
  },
11
  repository: {
12
  paragraph: [
13
  "If the dataset is hosted on github or has a github homepage, add URL here"
14
  ],
 
 
 
15
  },
16
  paper: {
17
  paragraph: [
18
  "If the dataset was introduced by a paper or there was a paper written describing the dataset, add URL here (landing page for Arxiv paper preferred)"
19
  ],
 
 
 
20
  },
21
  leaderboard: {
22
  paragraph: [
23
  "If the dataset supports an active leaderboard, add link here"
24
  ],
 
 
 
25
  },
26
  contact: {
27
  paragraph: [
28
  "If known, name and email of at least one person the reader can contact for questions about the dataset."
29
  ],
 
 
 
30
  },
31
  datasetSummary: {
32
  paragraph: [
33
  "Briefly summarize the dataset, its intended use and the supported tasks. Give an overview of how and why the dataset was created. The summary should explicitly mention the languages present in the dataset (possibly in broad terms, e.g. translations between several pairs of European languages), and describe the domain, topic, or genre covered."
34
  ],
 
 
 
35
  },
36
  supportedTasks: {
37
  paragraph: [
38
  "For each of the tasks tagged for this dataset, give a brief description of the tag, metrics, and suggested models (with a link to their HuggingFace implementation if available). Give a similar description of tasks that were not covered by the structured tag set (replace the `task-category-tag` with an appropriate `other:other-task-name`).",
39
  "- `task-category-tag`: The dataset can be used to train a model for [TASK NAME], which consists in [TASK DESCRIPTION]. Success on this task is typically measured by achieving a *high/low* [metric name](https://huggingface.co/metrics/metric_name). The ([model name](https://huggingface.co/model_name) or [model class](https://huggingface.co/transformers/model_doc/model_class.html)) model currently achieves the following score. *[IF A LEADERBOARD IS AVAILABLE]:* This task has an active leaderboard which can be found at [leaderboard url]() and ranks models based on [metric name](https://huggingface.co/metrics/metric_name) while also reporting [other metric name](https://huggingface.co/metrics/other_metric_name)."
 
 
 
40
  ]
41
  },
42
  languages: {
43
  paragraph: [
44
  "Provide a brief overview of the languages represented in the dataset. Describe relevant details about specifics of the language such as whether it is social media text, African American English,...",
45
  "When relevant, please provide [BCP-47 codes](https://tools.ietf.org/html/bcp47), which consist of a [primary language subtag](https://tools.ietf.org/html/bcp47#section-2.2.1), with a [script subtag](https://tools.ietf.org/html/bcp47#section-2.2.3) and/or [region subtag](https://tools.ietf.org/html/bcp47#section-2.2.4) if available."
 
 
 
46
  ]
47
  },
48
  dataInstances: {
@@ -55,12 +79,52 @@ export default {
55
  }
56
  `,
57
  "Provide any additional information that is not covered in the other sections about the data here. In particular describe any relationships between data points and if these relationships are made explicit.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  ]
59
  },
60
  dataFields: {
61
  paragraph: [
62
  "List and describe the fields present in the dataset. Mention their data type, and whether they are used as input or output in any of the tasks the dataset currently supports. If the data has span indices, describe their attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc. If the datasets contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.",
63
  "- `example_field`: description of `example_field`"
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  ]
65
  },
66
  dataSplits: {
@@ -68,15 +132,24 @@ export default {
68
  "Describe and name the splits in the dataset if there are more than one.",
69
  "Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g. if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
70
  "Provide the sizes of each split. As appropriate, provide any descriptive statistics for the features, such as average length. For example:",
71
- `| | Tain | Valid | Test |
72
- | ----- | ------ | ----- | ---- |
73
- | Input Sentences | | | |
74
- | Average Sentence Length | | | |`
 
 
 
 
 
 
75
  ]
76
  },
77
  curationRationale: {
78
  paragraph: [
79
  "What need motivated the creation of this dataset? What are some of the reasons underlying the major choices involved in putting it together?",
 
 
 
80
  ]
81
  },
82
  dataCollection: {
@@ -84,6 +157,10 @@ export default {
84
  "Describe the data collection process. Describe any criteria for data selection or filtering. List any key words or search terms used. If possible, include runtime information for the collection process.",
85
  "If data was collected from other pre-existing datasets, link to source here and to their [Hugging Face version](https://huggingface.co/datasets/dataset_name).",
86
  "If the data was modified or normalized after being collected (e.g. if the data is word-tokenized), describe the process and the tools used."
 
 
 
 
87
  ]
88
  },
89
  sourceLanguage: {
@@ -92,16 +169,25 @@ export default {
92
  "If available, include self-reported demographic or identity information for the source data creators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender.",
93
  "Describe the conditions under which the data was created (for example, if the producers were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.",
94
  "Describe other people represented or mentioned in the data. Where possible, link to references for the information."
 
 
 
95
  ]
96
  },
97
  annotations: {
98
  paragraph: [
99
  "If the dataset contains annotations which are not part of the initial data collection, describe them in the following paragraphs."
 
 
 
100
  ]
101
  },
102
  annotationProcess: {
103
  paragraph: [
104
  "If applicable, describe the annotation process and any tools used, or state otherwise. Describe the amount of data annotated, if not all. Describe or reference annotation guidelines provided to the annotators. If available, provide interannotator statistics. Describe any annotation validation processes."
 
 
 
105
  ]
106
  },
107
  annotators: {
@@ -110,6 +196,9 @@ export default {
110
  "Describe the people or systems who originally created the annotations and their selection criteria if applicable.",
111
  "If available, include self-reported demographic or identity information for the annotators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender.",
112
  "Describe the conditions under which the data was annotated (for example, if the annotators were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here."
 
 
 
113
  ]
114
  },
115
  personalInformation: {
@@ -117,6 +206,9 @@ export default {
117
  "State whether the dataset uses identity categories and, if so, how the information is used. Describe where this information comes from (i.e. self-reporting, collecting from profiles, inferring, etc.). See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender. State whether the data is linked to individuals and whether those individuals can be identified in the dataset, either directly or indirectly (i.e., in combination with other data).",
118
  "State whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history).",
119
  "If efforts were made to anonymize the data, describe the anonymization process."
 
 
 
120
  ]
121
  },
122
  socialImpact: {
@@ -124,6 +216,11 @@ export default {
124
  "Please discuss some of the ways you believe the use of this dataset will impact society.",
125
  "The statement should include both positive outlooks, such as outlining how technologies developed through its use may improve people's lives, and discuss the accompanying risks. These risks may range from making important decisions more opaque to people who are affected by the technology, to reinforcing existing harmful biases (whose specifics should be discussed in the next section), among other considerations.",
126
  "Please also mention in this section if the proposed dataset contains a *low-resource* or under-represented language."
 
 
 
 
 
127
  ]
128
  },
129
  biasesDiscussion: {
@@ -131,21 +228,36 @@ export default {
131
  "Provide descriptions of specific biases that are likely to be reflected in the data, and state whether any steps were taken to reduce their impact.",
132
  "For Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://arxiv.org/abs/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic.",
133
  "If analyses have been run quantifying these biases, please add brief summaries and links to the studies here."
 
 
 
 
 
 
134
  ]
135
  },
136
  limitations: {
137
  paragraph: [
138
  "If studies of the datasets have outlined other limitations of the dataset, such as annotation artifacts, please outline and cite them here."
 
 
 
139
  ]
140
  },
141
  datasetCurators: {
142
  paragraph: [
143
  "List the people involved in collecting the dataset and their affiliation(s). If funding information is known, include it here."
 
 
 
144
  ]
145
  },
146
  licensingInformation: {
147
  paragraph: [
148
  "Provide the license and link to the license webpage if available."
 
 
 
149
  ]
150
  },
151
  citationInformation: {
@@ -160,6 +272,28 @@ export default {
160
  }
161
  `,
162
  "If the dataset has a [DOI](https://www.doi.org/), please provide it here."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  ]
164
  },
165
  }
 
7
  paragraph: [
8
  "Add homepage URL here if available (unless it's a GitHub repository)"
9
  ],
10
+ example: [
11
+ "https://facebookresearch.github.io/ELI5/explore.html"
12
+ ]
13
  },
14
  repository: {
15
  paragraph: [
16
  "If the dataset is hosted on github or has a github homepage, add URL here"
17
  ],
18
+ example: [
19
+ "https://github.com/facebookresearch/ELI5"
20
+ ]
21
  },
22
  paper: {
23
  paragraph: [
24
  "If the dataset was introduced by a paper or there was a paper written describing the dataset, add URL here (landing page for Arxiv paper preferred)"
25
  ],
26
+ example: [
27
+ "https://arxiv.org/abs/1907.09190"
28
+ ]
29
  },
30
  leaderboard: {
31
  paragraph: [
32
  "If the dataset supports an active leaderboard, add link here"
33
  ],
34
+ example: [
35
+ ""
36
+ ]
37
  },
38
  contact: {
39
  paragraph: [
40
  "If known, name and email of at least one person the reader can contact for questions about the dataset."
41
  ],
42
+ example: [
43
+ "yacine@huggingface.co"
44
+ ]
45
  },
46
  datasetSummary: {
47
  paragraph: [
48
  "Briefly summarize the dataset, its intended use and the supported tasks. Give an overview of how and why the dataset was created. The summary should explicitly mention the languages present in the dataset (possibly in broad terms, e.g. translations between several pairs of European languages), and describe the domain, topic, or genre covered."
49
  ],
50
+ example: [
51
+ "The ELI5 dataset is an English-language dataset of questions and answers gathered from three subreddits where users ask factual questions requiring paragraph-length or longer answers. The dataset was created to support the task of open-domain long form abstractive question answering, and covers questions about general topics in its r/explainlikeimfive subset, science in its r/askscience subset, and History in its r/AskHistorians subset."
52
+ ]
53
  },
54
  supportedTasks: {
55
  paragraph: [
56
  "For each of the tasks tagged for this dataset, give a brief description of the tag, metrics, and suggested models (with a link to their HuggingFace implementation if available). Give a similar description of tasks that were not covered by the structured tag set (replace the `task-category-tag` with an appropriate `other:other-task-name`).",
57
  "- `task-category-tag`: The dataset can be used to train a model for [TASK NAME], which consists in [TASK DESCRIPTION]. Success on this task is typically measured by achieving a *high/low* [metric name](https://huggingface.co/metrics/metric_name). The ([model name](https://huggingface.co/model_name) or [model class](https://huggingface.co/transformers/model_doc/model_class.html)) model currently achieves the following score. *[IF A LEADERBOARD IS AVAILABLE]:* This task has an active leaderboard which can be found at [leaderboard url]() and ranks models based on [metric name](https://huggingface.co/metrics/metric_name) while also reporting [other metric name](https://huggingface.co/metrics/other_metric_name)."
58
+ ],
59
+ example: [
60
+ "abstractive-qa, open-domain-qa: The dataset can be used to train a model for Open Domain Long Form Question Answering. An LFQA model is presented with a non-factoid question and asked to retrieve relevant information from a knowledge source (such as Wikipedia), then use it to generate a multi-sentence answer. The model performance is measured by how high its ROUGE score to the reference is. A BART-based model with a dense retriever trained to draw information from Wikipedia passages achieves a ROUGE-L of 0.149."
61
  ]
62
  },
63
  languages: {
64
  paragraph: [
65
  "Provide a brief overview of the languages represented in the dataset. Describe relevant details about specifics of the language such as whether it is social media text, African American English,...",
66
  "When relevant, please provide [BCP-47 codes](https://tools.ietf.org/html/bcp47), which consist of a [primary language subtag](https://tools.ietf.org/html/bcp47#section-2.2.1), with a [script subtag](https://tools.ietf.org/html/bcp47#section-2.2.3) and/or [region subtag](https://tools.ietf.org/html/bcp47#section-2.2.4) if available."
67
+ ],
68
+ example: [
69
+ "The text in the dataset is in English, as spoken by Reddit users on the r/explainlikeimfive, r/askscience, and r/AskHistorians subreddits. The associated BCP-47 code is en."
70
  ]
71
  },
72
  dataInstances: {
 
79
  }
80
  `,
81
  "Provide any additional information that is not covered in the other sections about the data here. In particular describe any relationships between data points and if these relationships are made explicit.",
82
+ ],
83
+ example: [
84
+ "A typical data point comprises a question, with a title containing the main question and a selftext which sometimes elaborates on it, and a list of answers from the forum sorted by the number of upvotes they obtained. Additionally, the URLs in each of the text fields have been extracted to respective lists and replaced by generic tokens in the text.",
85
+ "An example from the ELI5 test set looks as follows:",
86
+ `{'q_id': '8houtx',`,
87
+ `title': 'Why does water heated to room temperature feel colder than the air around it?,`,
88
+ `selftext': '',`,
89
+ `document': '',`,
90
+ `subreddit': 'explainlikeimfive',`,
91
+ `answers': {'a_id': ['dylcnfk', 'dylcj49'],`,
92
+ `text': ["Water transfers heat more efficiently than air. When something feels cold it's because heat is being transferred from your skin to whatever you're touching. Since water absorbs the heat more readily than air, it feels colder.",
93
+ "Air isn't as good at transferring heat compared to something like water or steel (sit on a room temperature steel bench vs. a room temperature wooden bench, and the steel one will feel more cold).\n\nWhen you feel cold, what you're feeling is heat being transferred out of you. If there is no breeze, you feel a certain way. If there's a breeze, you will get colder faster (because the moving air is pulling the heat away from you), and if you get into water, its quite good at pulling heat from you. Get out of the water and have a breeze blow on you while you're wet, all of the water starts evaporating, pulling even more heat from you."],`,
94
+ `score': [5, 2]},`,
95
+ `title_urls': {'url': []},`,
96
+ `selftext_urls': {'url': []},`,
97
+ `answers_urls': {'url': []}}{'q_id': '8houtx',`,
98
+ `title': 'Why does water heated to room temperature feel colder than the air around it?',`,
99
+ `selftext': '',`,
100
+ `document': '',`,
101
+ `subreddit': 'explainlikeimfive',`,
102
+ `answers': {'a_id': ['dylcnfk', 'dylcj49'],`,
103
+ `text': ["Water transfers heat more efficiently than air. When something feels cold it's because heat is being transferred from your skin to whatever you're touching. Since water absorbs the heat more readily than air, it feels colder.",
104
+ "Air isn't as good at transferring heat compared to something like water or steel (sit on a room temperature steel bench vs. a room temperature wooden bench, and the steel one will feel more cold).\n\nWhen you feel cold, what you're feeling is heat being transferred out of you. If there is no breeze, you feel a certain way. If there's a breeze, you will get colder faster (because the moving air is pulling the heat away from you), and if you get into water, its quite good at pulling heat from you. Get out of the water and have a breeze blow on you while you're wet, all of the water starts evaporating, pulling even more heat from you."]`,
105
+ `score': [5, 2]},`,
106
+ `title_urls': {'url': []},`,
107
+ `selftext_urls': {'url': []},`,
108
+ `answers_urls': {'url': []}},`
109
  ]
110
  },
111
  dataFields: {
112
  paragraph: [
113
  "List and describe the fields present in the dataset. Mention their data type, and whether they are used as input or output in any of the tasks the dataset currently supports. If the data has span indices, describe their attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc. If the dataset contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.",
114
  "- `example_field`: description of `example_field`"
115
+ ],
116
+ example: [
117
+ `q_id: a string question identifier for each example, corresponding to its ID in the Pushshift.io Reddit submission dumps.`,
118
+ `subreddit: One of explainlikeimfive, askscience, or AskHistorians, indicating which subreddit the question came from`,
119
+ `title: title of the question, with URLs extracted and replaced by URL_n tokens`,
120
+ `title_urls: list of the extracted URLs, the nth element of the list was replaced by URL_n`,
121
+ `selftext: either an empty string or an elaboration of the question`,
122
+ `selftext_urls: similar to title_urls but for self_text`,
123
+ `answers: a list of answers, each answer has:`,
124
+ `a_id: a string answer identifier for each answer, corresponding to its ID in the Pushshift.io Reddit comments dumps.`,
125
+ `text: the answer text with the URLs normalized`,
126
+ `score: the number of upvotes the answer had received when the dumps were created`,
127
+ `answers_urls: a list of the extracted URLs. All answers use the same list, the numbering of the normalization token continues across answer texts`,
128
  ]
129
  },
130
  dataSplits: {
 
132
  "Describe and name the splits in the dataset if there are more than one.",
133
  "Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g. if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
134
  "Provide the sizes of each split. As appropriate, provide any descriptive statistics for the features, such as average length. For example:",
135
+ ` Train Valid Test
136
+ Input Sentences
137
+ Average Sentence Length`,
138
+ ],
139
+ example: [
140
+ "The data is split into a training, validation and test set for each of the three subreddits. In order to avoid having duplicate questions across sets, the title fields of the questions were ranked by their tf-idf match to their nearest neighbor and the ones with the smallest value were used in the test and validation sets. The final split sizes are as follows:",
141
+ ` Train Valid Test
142
+ r/explainlikeimfive examples 272634 9812 24512
143
+ r/askscience examples 131778 2281 4462
144
+ r/AskHistorians examples 98525 4901 9764`
145
  ]
146
  },
147
  curationRationale: {
148
  paragraph: [
149
  "What need motivated the creation of this dataset? What are some of the reasons underlying the major choices involved in putting it together?",
150
+ ],
151
+ example: [
152
+ "ELI5 was built to provide a testbed for machines to learn how to answer more complex questions, which requires them to find and combine information in a coherent manner. The dataset was built by gathering questions that were asked by community members of three subreddits, including r/explainlikeimfive, along with the answers that were provided by other users. The rules of the subreddit make this data particularly well suited to training a model for abstractive question answering: the questions need to seek an objective explanation about well established facts, and the answers provided need to be understandable to a layperson without any particular knowledge domain."
153
  ]
154
  },
155
  dataCollection: {
 
157
  "Describe the data collection process. Describe any criteria for data selection or filtering. List any key words or search terms used. If possible, include runtime information for the collection process.",
158
  "If data was collected from other pre-existing datasets, link to source here and to their [Hugging Face version](https://huggingface.co/datasets/dataset_name).",
159
  "If the data was modified or normalized after being collected (e.g. if the data is word-tokenized), describe the process and the tools used."
160
+ ],
161
+ example:[
162
+ "The data was obtained by filtering submissions and comments from the subreddits of interest from the XML dumps of the Reddit forum hosted on Pushshift.io.",
163
+ "In order to further improve the quality of the selected examples, only questions with a score of at least 2 and at least one answer with a score of at least 2 were selected for the dataset. The dataset questions and answers span a period from August 2012 to August 2019."
164
  ]
165
  },
166
  sourceLanguage: {
 
169
  "If available, include self-reported demographic or identity information for the source data creators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender.",
170
  "Describe the conditions under which the data was created (for example, if the producers were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.",
171
  "Describe other people represented or mentioned in the data. Where possible, link to references for the information."
172
+ ],
173
+ example: [
174
+ "The language producers are users of the r/explainlikeimfive, r/askscience, and r/AskHistorians subreddits between 2012 and 2019. No further demographic information was available from the data source."
175
  ]
176
  },
177
  annotations: {
178
  paragraph: [
179
  "If the dataset contains annotations which are not part of the initial data collection, describe them in the following paragraphs."
180
+ ],
181
+ example: [
182
+ "The dataset does not contain any additional annotations."
183
  ]
184
  },
185
  annotationProcess: {
186
  paragraph: [
187
  "If applicable, describe the annotation process and any tools used, or state otherwise. Describe the amount of data annotated, if not all. Describe or reference annotation guidelines provided to the annotators. If available, provide interannotator statistics. Describe any annotation validation processes."
188
+ ],
189
+ example: [
190
+ "[N/A]"
191
  ]
192
  },
193
  annotators: {
 
196
  "Describe the people or systems who originally created the annotations and their selection criteria if applicable.",
197
  "If available, include self-reported demographic or identity information for the annotators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender.",
198
  "Describe the conditions under which the data was annotated (for example, if the annotators were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here."
199
+ ],
200
+ example: [
201
+ "[N/A]"
202
  ]
203
  },
204
  personalInformation: {
 
206
  "State whether the dataset uses identity categories and, if so, how the information is used. Describe where this information comes from (i.e. self-reporting, collecting from profiles, inferring, etc.). See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender. State whether the data is linked to individuals and whether those individuals can be identified in the dataset, either directly or indirectly (i.e., in combination with other data).",
207
  "State whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history).",
208
  "If efforts were made to anonymize the data, describe the anonymization process."
209
+ ],
210
+ example: [
211
+ "The authors removed the speaker IDs from the Pushshift.io dumps but did not otherwise anonymize the data. Some of the questions and answers are about contemporary public figures or individuals who appeared in the news."
212
  ]
213
  },
214
  socialImpact: {
 
216
  "Please discuss some of the ways you believe the use of this dataset will impact society.",
217
  "The statement should include both positive outlooks, such as outlining how technologies developed through its use may improve people's lives, and discuss the accompanying risks. These risks may range from making important decisions more opaque to people who are affected by the technology, to reinforcing existing harmful biases (whose specifics should be discussed in the next section), among other considerations.",
218
  "Please also mention in this section if the proposed dataset contains a *low-resource* or under-represented language."
219
+ ],
220
+ example: [
221
+ "The purpose of this dataset is to help develop better question answering systems.",
222
+ "A system that succeeds at the supported task would be able to provide a coherent answer to even complex questions requiring a multi-step explanation, which is beyond the ability of even the larger existing models. The task is also thought of as a test-bed for retrieval models which can show the users which source text was used in generating the answer and allow them to confirm the information provided to them.",
223
+ "It should be noted however that the provided answers were written by Reddit users, information which may be lost if models trained on it are deployed in down-stream applications and presented to users without context. The specific biases this may introduce are discussed in the next section.",
224
  ]
225
  },
226
  biasesDiscussion: {
 
228
  "Provide descriptions of specific biases that are likely to be reflected in the data, and state whether any steps were taken to reduce their impact.",
229
  "For Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://arxiv.org/abs/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic.",
230
  "If analyses have been run quantifying these biases, please add brief summaries and links to the studies here."
231
+ ],
232
+ example: [
233
+ "While Reddit hosts a number of thriving communities with high quality discussions, it is also widely known to have corners where sexism, hate, and harassment are significant issues. See for example the recent post from Reddit founder u/spez outlining some of the ways he thinks the website's historical policies have been responsible for this problem, Adrienne Massanari's 2015 article on GamerGate and follow-up works, or a 2019 Wired article on misogyny on Reddit.",
234
+ "While there has been some recent work in the NLP community on de-biasing models (e.g. Black is to Criminal as Caucasian is to Police: Detecting and Removing Multiclass Bias in Word Embeddings for word embeddings trained specifically on Reddit data), this problem is far from solved, and the likelihood that a trained model might learn the biases present in the data remains a significant concern.",
235
+ `We still note some encouraging signs for all of these communities: r/explainlikeimfive and r/askscience have similar structures and purposes, and r/askscience was found in 2015 to show medium supportiveness and very low toxicity when compared to other subreddits (see a hackerfall post, thecut.com write-up and supporting data). Meanwhile, the r/AskHistorians rules mention that the admins will not tolerate "racism, sexism, or any other forms of bigotry". However, further analysis of whether and to what extent these rules reduce toxicity is still needed.`,
236
+ "We also note that given the audience of the Reddit website which is more broadly used in the US and Europe, the answers will likely present a Western perspective, which is particularly important to note when dealing with historical topics."
237
  ]
238
  },
239
  limitations: {
240
  paragraph: [
241
  "If studies of the datasets have outlined other limitations of the dataset, such as annotation artifacts, please outline and cite them here."
242
+ ],
243
+ example: [
244
+ "The answers provided in the dataset represent the opinion of Reddit users. While these communities strive to be helpful, they should not be considered to represent a ground truth."
245
  ]
246
  },
247
  datasetCurators: {
248
  paragraph: [
249
  "List the people involved in collecting the dataset and their affiliation(s). If funding information is known, include it here."
250
+ ],
251
+ example: [
252
+ "The dataset was initially created by Angela Fan, Ethan Perez, Yacine Jernite, Jason Weston, Michael Auli, and David Grangier, during work done at Facebook AI Research (FAIR)."
253
  ]
254
  },
255
  licensingInformation: {
256
  paragraph: [
257
  "Provide the license and link to the license webpage if available."
258
+ ],
259
+ example: [
260
+ "The licensing status of the dataset hinges on the legal status of the Pushshift.io data which is unclear."
261
  ]
262
  },
263
  citationInformation: {
 
272
  }
273
  `,
274
  "If the dataset has a [DOI](https://www.doi.org/), please provide it here."
275
+ ],
276
+ example: [
277
+ `@inproceedings{eli5_lfqa`,
278
+ `author = {Angela Fan and
279
+ Yacine Jernite and
280
+ Ethan Perez and
281
+ David Grangier and
282
+ Jason Weston and
283
+ Michael Auli}`,
284
+ `editor = {Anna Korhonen and
285
+ David R. Traum and
286
+ Llu{\'{\i}}s M{\`{a}}rquez}`,
287
+ `title = {{ELI5:} Long Form Question Answering}`,
288
+ `booktitle = {Proceedings of the 57th Conference of the Association for Computational
289
+ Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
290
+ Volume 1: Long Papers}`,
291
+ `pages = {3558--3567}`,
292
+ `publisher = {Association for Computational Linguistics}`,
293
+ `year = {2019}`,
294
+ `url = {https://doi.org/10.18653/v1/p19-1346}`,
295
+ `doi = {10.18653/v1/p19-1346}
296
+ }`
297
  ]
298
  },
299
  }
src/Section.js ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import {BsChevronDown, BsChevronUp} from 'react-icons/bs'
3
+
4
+ export default function Section({ title, section, handleSection }) {
5
+
6
+ return (
7
+ <>
8
+ <div className="mt-1">
9
+ <div onClick={() => handleSection()} className="cursor-pointer grid grid-cols-12 gap-16 pt-6 borders text-gray-600">
10
+ <div className="col-span-11"> {title} </div>
11
+ {section ?
12
+ <BsChevronUp className="col-span-1 ml-2" />
13
+ :
14
+ <BsChevronDown className="col-span-1 ml-2" />
15
+ }
16
+ </div>
17
+ </div>
18
+ <style>{`
19
+ .borders {
20
+ border-bottom: solid 1px;
21
+ border-color: #e2e8f0;
22
+ }
23
+ `}</style>
24
+ </>
25
+ );
26
+ }