updating repro code for MLM data

nyu-mll · pruksmhc · Apr 10, 2020 · Oct 30, 2019 · Oct 31, 2019 · Nov 4, 2019
commit 9446cb78060eef0e5048b98832e1e1af93d9783b
@@ -1,21 +1,21 @@
 # Downloading Wikipedia Corpus
-We use the preprocessing code from https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT#getting-the-data 
-and the bash scripts provided here is used to help with streamlining the data generation in the NVIDIA repository. 
+We use the preprocessing code from https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT#getting-the-data
+and the bash scripts provided here are used to help streamline the data generation in the NVIDIA repository.
 
-First, git clone https://github.com/NVIDIA/DeepLearningExamples.git. 
-Then, move create_wiki_data.sh and get_small_english_wiki.sh into DeepLearningExamples/PyTorch/LanguageModeling/BERT/data. 
+First, git clone https://github.com/NVIDIA/DeepLearningExamples.git.
+Then, move create_wiki_data.sh and get_small_english_wiki.sh into DeepLearningExamples/PyTorch/LanguageModeling/BERT/data.
 
-You will have to set 'BERT_PREP_WORKING_DIR' as an environment variable to specify the directory you would like to save the 
-Wikipedia data to. 
 
 Then, follow the instructions below:
 
 Run `bash create_wiki_data.sh $lang $save_directory`
+The NVIDIA code supports English (en) and Chinese (zh) Wikipedia.
 
 For example, to download and process English Wikipedia and save it in the `~/Download` directory, run
 `bash create_wiki_data.sh en ~/Download`
 
 The above command will download the entire English Wikipedia.
 
-In our experiments, we only use a small subset (around 5% of) the entire English Wikipedia, which has the same number of sentences as Wikitext103. 
+In our experiments, we only use a small subset (around 5%) of the entire English Wikipedia, which has the same number of sentences as Wikitext103.
 To get this subset, run `bash get_small_english_wiki.sh $path_to_wikicorpus_en`, where `$path_to_wikicorpus_en` is the directory where you saved the full processed `wikicorpus_en` corpus.
+
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 lang=$1 #the language, 'en' for English wikipedia
-save_dir=$2 
+export BERT_PREP_WORKING_DIR=$2
 
 # clone wikiextractor if it doesn't exist
 if [ ! -d "wikiextractor" ]; then
@@ -23,19 +23,19 @@ fi
 
 # Report the target directory via BERT_PREP_WORKING_DIR: save_dir is not
 # assigned until the shard-combining step further down, so it is empty here.
 echo "Downloading $lang wikpedia in directory $BERT_PREP_WORKING_DIR"
 # Download
-python3 bertPrep.py --action download --dataset wikicorpus_$lang --save_dir $save_dir
+python3 bertPrep.py --action download --dataset wikicorpus_$lang
 
 
 # Properly format the text files
-python3 bertPrep.py --action text_formatting --dataset wikicorpus_$lang --save_dir $save_dir
+python3 bertPrep.py --action text_formatting --dataset wikicorpus_$lang
 
 
 # Shard the text files (group wiki+books then shard)
-python3 bertPrep.py --action sharding --dataset wikicorpus_$lang --save_dir $save_dir
+python3 bertPrep.py --action sharding --dataset wikicorpus_$lang
 
 
 # Combine sharded files into one
-save_dir=$save_dir/sharded_training_shards_256_test_shards_256_fraction_0.2/wikicorpus_$lang
+save_dir=$BERT_PREP_WORKING_DIR/sharded_training_shards_256_test_shards_256_fraction_0.2/wikicorpus_$lang
 cat $save_dir/*training*.txt > $save_dir/train_$lang.txt
 cat $save_dir/*test*.txt > $save_dir/test_$lang.txt
 rm -rf $save_dir/wiki*training*.txt
@@ -46,3 +46,4 @@ sed -i 's/<[^>]*>//g' $save_dir/train_$lang.txt
 sed -i 's/<[^>]*>//g' $save_dir/test_$lang.txt
 
 echo "Your corpus is saved in $save_dir"
+