diff --git a/src/models/transformer.h b/src/models/transformer.h
index 8b0227315..27ac153ce 100755
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@@ -87,6 +87,7 @@ class Transformer : public EncoderOrDecoderBase {
     // according to paper embeddings are scaled up by \sqrt(d_m)
     embeddings = std::sqrt((float)dimEmb) * embeddings; // embeddings were initialized to unit length; so norms will be in order of sqrt(dimEmb)
 
+#ifdef USE_ONNX // TODO: the 'Sin' op and the constant sine table generate different results, so use the constant when 'USE_ONNX' is not defined for now.
     // precompute the arguments to sin() (the cos(x) are expressed as sin(x+pi/2))
     if (sinusoidalEmbeddingsFreq_.empty()) {
       auto numTimescales = dimEmb / 2;
@@ -100,6 +101,10 @@ class Transformer : public EncoderOrDecoderBase {
     auto positionRange = graph_->constant({ dimWords, 1, 1 }, inits::range((float)start, (float)start + (float)dimWords));
     positionRange->set_name("data_" + std::to_string(batchIndex_) + "_posrange");
     auto signal = sin(positionRange * frequencies + cosOffsets);
+#else // USE_ONNX
+    auto signal = graph_->constant({dimWords, 1, dimEmb},
+                                   inits::sinusoidalPositionEmbeddings(start));
+#endif // USE_ONNX
     embeddings = embeddings + signal;
   }
 
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 8529db8b5..9a5d233dc 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -96,7 +96,7 @@ void ProdBatched(marian::Tensor C,
   auto strideC = n * m;
 
   auto batchC = std::max(batchA, batchB);
-#if MKL_FOUND
+#if 0 // TODO: accuracy regression; batched GEMM generates different output. Disabled while investigating.
   CBLAS_TRANSPOSE transA_forarr = CblasNoTrans;
   CBLAS_TRANSPOSE transB_forarr = CblasNoTrans;
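
Note (not part of the patch): a minimal standalone sketch of the constant sinusoidal position table that inits::sinusoidalPositionEmbeddings(start) is expected to produce, assuming the usual transformer formulation in which the cos(x) entries are expressed as sin(x + pi/2), as the comment in the first hunk states. The function name sinusoidalPositionTable, the 10000 maximum timescale, and the sin-half/cos-half row layout are assumptions for illustration only.

#include <algorithm>
#include <cmath>
#include <vector>

// Builds a row-major [dimWords x dimEmb] table. For position p (offset by `start`)
// and timescale i, the first dimEmb/2 entries of each row hold sin(p * freq_i) and
// the remaining entries hold sin(p * freq_i + pi/2), i.e. cos(p * freq_i), with
// freq_i decaying geometrically from 1 down to 1/10000 (assumed max timescale).
std::vector<float> sinusoidalPositionTable(int start, int dimWords, int dimEmb) {
  const int numTimescales = dimEmb / 2;
  const float logTimescaleIncrement
      = std::log(10000.f) / (float)std::max(numTimescales - 1, 1);
  std::vector<float> table((size_t)dimWords * dimEmb, 0.f);
  for(int p = 0; p < dimWords; ++p) {
    for(int i = 0; i < numTimescales; ++i) {
      float freq = std::exp(-(float)i * logTimescaleIncrement);
      float arg  = (float)(start + p) * freq;
      table[(size_t)p * dimEmb + i]                 = std::sin(arg);              // sin half
      table[(size_t)p * dimEmb + numTimescales + i] = std::sin(arg + 1.5707964f); // cos half as sin(x + pi/2)
    }
  }
  return table;
}

Note (also not part of the patch): for the prod.cpp hunk, a naive reference for what the batched product computes, which may help when investigating the accuracy regression mentioned in the TODO. It assumes the common broadcast convention suggested by batchC = std::max(batchA, batchB) (a batch of size 1 is reused across the larger batch) and ignores transposition for brevity; the function name prodBatchedReference is hypothetical.

#include <algorithm>
#include <cstddef>

// Reference batched GEMM on row-major data: C[b] = alpha * A[ba] * B[bb] + beta * C[b],
// where A is [batchA, m, k], B is [batchB, k, n], C is [batchC, m, n] and a batch of
// size 1 is broadcast across the larger batch.
void prodBatchedReference(float* C, const float* A, const float* B,
                          std::size_t batchA, std::size_t batchB,
                          std::size_t m, std::size_t k, std::size_t n,
                          float alpha, float beta) {
  const std::size_t batchC  = std::max(batchA, batchB);
  const std::size_t strideA = m * k, strideB = k * n, strideC = m * n;
  for(std::size_t b = 0; b < batchC; ++b) {
    const float* a  = A + (batchA == 1 ? 0 : b * strideA); // broadcast A if batchA == 1
    const float* bp = B + (batchB == 1 ? 0 : b * strideB); // broadcast B if batchB == 1
    float* c = C + b * strideC;
    for(std::size_t i = 0; i < m; ++i)
      for(std::size_t j = 0; j < n; ++j) {
        float sum = 0.f;
        for(std::size_t x = 0; x < k; ++x)
          sum += a[i * k + x] * bp[x * n + j];
        c[i * n + j] = alpha * sum + beta * c[i * n + j];
      }
  }
}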