Christopher Akiki committed on
Commit
2528d92
1 Parent(s): 21a7e2f

Add additional code filtering functions

Browse files
resources/sources_with_info_cards.json CHANGED
@@ -125,7 +125,7 @@
125
  }
126
  ],
127
  "total": 159.294113344,
128
- "data_card": "# github-no-gpl\n\n- Dataset uid: `github-no-gpl`\n\n### Description\n\n\n\n- C++\n- C#\n- Go\n- Java\n- JavaScript\n- Lua\n- PHP\n- Python 2\n- Python 3\n- Ruby\n- Rust\n- Scala\n- TypeScript\n### Homepage\n\n\n\n### Licensing\n\n\n\n### Speaker Locations\n\n\n\n### Sizes\n\n- 13.1372 % of total\n- 85.2591 % of code\n\n### BigScience processing steps\n\n#### Filters applied to: code\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n"
129
  }
130
  ],
131
  [
 
125
  }
126
  ],
127
  "total": 159.294113344,
128
+ "data_card": "# github-no-gpl\n\n- Dataset uid: `github-no-gpl`\n\n### Description\n\n\n\n- C++\n- C#\n- Go\n- Java\n- JavaScript\n- Lua\n- PHP\n- Python 2\n- Python 3\n- Ruby\n- Rust\n- Scala\n- TypeScript\n### Homepage\n\n\n\n### Licensing\n\n\n\n### Speaker Locations\n\n\n\n### Sizes\n\n- 13.1372 % of total\n- 85.2591 % of code\n\n### BigScience processing steps\n\n#### Filters applied to: code\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n --- \n - <1MB (there’s nothing bigger on BigQuery anyway) \n - whitespace agnostic deduplication \n - files with a line >1000 characters \n - exclude GPL \n - filter_token_len_avg_std \n - filter_text_len \n - filter_special_character_ratio \n - filter_longest_line \n - filter_by_all \n"
129
  }
130
  ],
131
  [