Language Model을 빌드할 때

빌드할 대상 파일이 너무 클 경우

이 파일을 split으로 분할해서 Language Model을 만들 수 있다.

tip > LM을 만들 때 파일을 깔끔하게 분리하기 위해 split 명령어의 -l 옵션, 즉 라인 단위로 잘라서 빌드를 했으나

라인 길이가 천차만별이라 파일 사이즈가 제각각으로 생성되었다.

문제는 이렇게 다양한 파일 크기 중에 특정 임계치 이상인 파일이 있을 경우 cannot allocate memory라는 메시지를 남기고

ngram-count 프로세스가 죽는 문제가 발생했다는 것이다.

하는 수 없이 split -b로 파일을 일정한 크기(바이트 단위)로 기계적으로 잘라서 만든 후에야 LM 빌드에 성공했다.


아래는 대용량(BIG DATA) LM 빌드를 위한 스크립트이다.



#!/bin/bash

# Strict mode for the whole script:
#   -u (nounset): expanding an unset variable is treated as an error.
#   -e (errexit): the script stops as soon as a simple command returns
#                 a non-zero status.
set -u
set -e



#######################################
# ERR-trap handler: reports the location at which the script failed.
# Globals:   red_color, reset_color (read; both optional)
# Outputs:   one diagnostic line on stderr that embeds the output of
#            `caller 0`.
#######################################
error_message() {
    # ${var-} lets the handler run even if the colour variables have not
    # been assigned yet, so it can never abort itself under `set -o nounset`.
    # Diagnostics go to stderr so they do not mix with redirected stdout.
    printf 'Exits abnormally at line  %s %s %s\n' \
        "${red_color-}" "$(caller 0)" "${reset_color-}" >&2
}
trap 'error_message' ERR


# ANSI colour sequences used in progress banners and error messages.
# They must start with a real ESC byte (\033); the caret notation "^["
# is only how editors display that byte and has no effect when printed.
red_color=$'\033[31m'
green_color=$'\033[32m'
reset_color=$'\033[0m'

# File-name suffixes for intermediate corpus files.
SUFFIX_LOW="lowercased"
SUFFIX_TOK="token"


NGRAM=4
SMT_HOME=/data2/jchern/smt

# Defaults to `cat`; replaced by the dha binary when --dha is given.
DHA=cat
DEV="$SMT_HOME/dev"
SMT_BIN="$SMT_HOME/bin"
SRILM="$SMT_HOME/srilm"
SCRIPTS="$SMT_HOME/scripts"
TRAIN_DIR="$SMT_HOME/training"
WDIR="$SMT_HOME/working-dir"
MOSES_SCRIPT="$SMT_HOME/bin/moses-scripts/scripts-20111006-1552"
TRAIN_BIG_DIR="$SMT_HOME/training-monolingual"
EXAMPLE_DIR="$SMT_HOME/example"
MOSES="$SMT_HOME/moses/moses-cmd/src/moses"
export SCRIPTS_ROOTDIR="$MOSES_SCRIPT"
DETOKENIZER="$SCRIPTS/detokenizer.perl"
TOKENIZER="$SCRIPTS/tokenizer.perl"
TOLOWER="$SCRIPTS/lowercase.perl"


#######################################
# Prints the call site followed by the command-line help, then terminates
# the script.
# Outputs:   help text on stdout
# Returns:   never returns; exits with the status of the last echo
#######################################
usage()
{
    local help_line

    # Left unquoted on purpose so the output of `caller` is re-joined with
    # single spaces, exactly as a plain echo of its words would print it.
    echo $(caller)

    for help_line in \
        "KOR->ENG: " \
        "  ./Big_train_language_model.sh --filename-for-lang-model=europarl-v6.en --verbose --dha" \
        "ENG->KOR: " \
        "  ./Big_train_language_model.sh --filename-for-lang-model=koreablog.BIG.ko --verbose --dha" \
        "         --filename-for-lang-model : big data file To build language model" \
        "         --dha : use dha" \
        "         --verbose(d) : verbose"
    do
        echo "$help_line"
    done
    exit
}

# ---------------------------------------------------------------------------
# Command-line parsing.
#   --filename-for-lang-model FILE   corpus file name (required)
#   --dha                            sets use_dha=1
#   --verbose | -d                   sets verbose=1
#   --help    | -h                   print usage and exit
# ---------------------------------------------------------------------------

# getopt is run on its own so that ITS exit status is the one tested.
# In `set -- $(getopt ...) || usage` the status checked belongs to `set`,
# which is always 0, so invalid options were silently accepted.
if ! parsed_args=$(getopt -n"$0" -u -a --longoptions="filename-for-lang-model: dha help verbose" "dh" "$@")
then
    usage
fi
# Unquoted on purpose: `getopt -u` emits a plain whitespace-separated list.
# (Consequently option values containing whitespace are not supported.)
set -- $parsed_args

# getopt terminates its output with "--", so at most one word here means
# that no option at all was supplied.
if [ $# -le 1 ]
then
    usage
fi

STEP=0
verbose="0"
use_dha=0
# Initialised so a missing --filename-for-lang-model is reported below
# instead of tripping `set -o nounset` later in the script.
BIG_FILE=""

while [ $# -gt 0 ]
do
    case "$1" in
        --filename-for-lang-model)
            BIG_FILE=$2
            shift 2
            ;;
        --dha)
            use_dha=1
            shift
            ;;
        --verbose | -d)
            verbose=1
            shift
            ;;
        --help | -h)
            usage       # does not return
            ;;
        --)
            break
            ;;
        *)
            echo "unknown option : $1"
            usage       # does not return
            ;;
    esac
done

if [[ -z "$BIG_FILE" ]]
then
    echo "missing required option : --filename-for-lang-model"
    usage
fi

# The translation direction is derived from the corpus file name:
# the europarl file selects ko -> en, any other file selects en -> ko.
case "$BIG_FILE" in
    "europarl-v6.en")
        # KOREAN -> ENGLISH
        LANG1=ko
        LANG2=en
        ;;
    *)
        # ENGLISH -> KOREAN
        LANG1=en
        LANG2=ko
        ;;
esac

# Install the HANL configuration into the dha_morph tree (done for every run).
cp /data2/jchern/smt/HANL.CONF /data2/jchern/dha_morph/trunk/config

case "$use_dha" in
    1)
        DHA=/data2/jchern/dha_morph/trunk/src/testhanl
        echo "USE DHA..."
        # NOTE(review): the same copy has already been made just above,
        # so this one looks redundant - confirm before removing.
        cp /data2/jchern/smt/HANL.CONF /data2/jchern/dha_morph/trunk/config
        ;;
    *)
        echo "USE CAT..."
        ;;
esac




echo "LANGUAGE 1 --> $LANG1"
echo "LANGUAGE 2 --> $LANG2"


echo "==============================="
echo "$red_color [1.BUILD LANGUAGE MODEL]... $reset_color"
echo "==============================="
echo "------------------------"
echo "$green_color [1.1 MAKE DIC:lm]... $reset_color"
echo "------------------------"
mkdir -p "$WDIR/lm"

# Step 1.1: run the language-specific refine_<lang> script over the raw
# corpus, unless its output already exists from an earlier run.
tok_file="$WDIR/lm/$BIG_FILE.$SUFFIX_TOK"
if [[ -e "$tok_file" ]]
then
    if [[ "$verbose" -eq 1 ]]
    then
        echo "alreay exist file - $tok_file"
    fi
    echo ""
else
    if [[ "$verbose" -eq 1 ]]
    then
        echo "$SCRIPTS/refine_$LANG2 < $TRAIN_BIG_DIR/$BIG_FILE > $tok_file"
    fi
    # Write to a side file and rename only on success. Redirecting straight
    # into $tok_file would leave a truncated file behind when the refine step
    # fails, and the next run would then skip refining because the file
    # "already exists".
    "$SCRIPTS/refine_$LANG2" < "$TRAIN_BIG_DIR/$BIG_FILE" > "$tok_file.partial"
    mv -f -- "$tok_file.partial" "$tok_file"
fi


# Step 1.3 preparation: remove the outputs of any previous run.
cd "$SMT_HOME"
rm -f -- file-list
# A missing counts directory is created instead of aborting the script.
mkdir -p "$SMT_HOME/counts"
rm -f -- "${SMT_HOME:?}"/counts/*
# Chunks left behind by an aborted run would otherwise be picked up again
# when the file list is built from jchern_*.
rm -f -- "${SMT_HOME:?}"/jchern_*
echo "------------------------"
echo "$green_color [1.3 SPLIT FILE]... $reset_color"
echo "------------------------"

# Upper bound, in bytes, for each chunk.
chunk_bytes=10000000

if [[ "$verbose" -eq 1 ]]
then
    echo "  1.3.1 split -C $chunk_bytes $WDIR/lm/$BIG_FILE.$SUFFIX_TOK jchern_"
    echo ""
fi
# -C (GNU split, --line-bytes) caps each chunk at chunk_bytes like -b does,
# but only breaks at line ends, so no sentence or multi-byte character is cut
# in the middle at a chunk boundary. Chunks are written to the current
# directory, which is $SMT_HOME.
split -C "$chunk_bytes" "$WDIR/lm/$BIG_FILE.$SUFFIX_TOK" jchern_
rm -f -- "$SMT_HOME/file-list"


if [[ "$verbose" -eq 1 ]]
then
    echo "  1.3.2 ls jchern_* > file-list"
    echo ""
fi

# Step 1.3.2: write the absolute path of every chunk, one per line, into
# file-list. The shell expands the glob directly, so there is no `ls` output
# to parse and no intermediate temp file.
chunk_files=("$SMT_HOME"/jchern_*)
if [[ ! -e "${chunk_files[0]}" ]]
then
    # An unmatched glob is left as the literal pattern, which does not exist.
    echo "no split chunks (jchern_*) found in $SMT_HOME" >&2
    exit 1
fi
printf '%s\n' "${chunk_files[@]}" > "$SMT_HOME/file-list"


# Step 1.3.3: run make-batch-counts over the chunks named in file-list.
# Its stdout is collected in a fresh $SMT_HOME/log.
# NOTE(review): NGRAM is assigned near the top of this file but no n-gram
# order is passed on this command line - confirm the tool default is intended.
batch_list="$SMT_HOME/file-list"
batch_log="$SMT_HOME/log"

if [[ $verbose -eq 1 ]]; then
    echo "  1.3.3 $SRILM/bin/make-batch-counts $batch_list"
fi

rm -f "$batch_log"
"$SRILM/bin/make-batch-counts" "$batch_list" >> "$batch_log"

if [[ "$verbose" -eq 1 ]]
then
    echo "  1.3.4 $SRILM/bin/merge-batch-counts $SMT_HOME/counts"
fi

# Step 1.3.4: merge the per-chunk counts. The stderr of merge-batch-counts is
# captured because the name of the merged file is read from its
# "final counts in <file>" line (4th whitespace-separated field).
merge_log="$SMT_HOME/message"
"$SRILM/bin/merge-batch-counts" "$SMT_HOME/counts" 2> "$merge_log"

# Extract the file name once; "%s" keeps a '%' in the path from being
# interpreted as a printf format directive.
in_file=$(awk '/final counts in/ { printf "%s", $4 }' "$merge_log")
echo "in_file=$in_file"

if [[ -z "$in_file" ]]
then
    # The captured output is kept so the failure can be investigated.
    echo "merge-batch-counts did not report a \"final counts in\" file; see $merge_log" >&2
    exit 1
fi
rm -f -- "$merge_log"

# Step 1.3.5: build the final language model from the merged counts file
# whose path is held in in_file.
# Stop here if in_file is empty: an empty, unquoted argument would make
# "-lm" be taken as the value of -read.
: "${in_file:?no merged counts file to read}"

if [[ "$verbose" -eq 1 ]]
then
    # Show the command that is really executed, including the real counts file.
    echo "  1.3.5 $SRILM/bin/make-big-lm -read  $in_file -lm $WDIR/lm/$BIG_FILE.lm"
fi
# NOTE(review): NGRAM is assigned near the top of this file but no n-gram
# order is passed on this command line - confirm the tool default is intended.
"$SRILM/bin/make-big-lm" -read "$in_file" -lm "$WDIR/lm/$BIG_FILE.lm"

# Remove the split chunks and any biglm.* files left in $SMT_HOME.
rm -f -- "${SMT_HOME:?}"/jchern_*
rm -f -- "${SMT_HOME:?}"/biglm.*


echo ""
echo "$green_color job completed............. $reset_color"
echo ""