Skip to content

Commit

Permalink
Merged PR 8388: Packed GEMM (FBGEMM) enabling on windows + GEMM optio…
Browse files Browse the repository at this point in the history
…n parameter +some fixes

* FBGEMM based packed GEMM is enabled on both windows and Linux
* On top of the previous PR, some more changes have been added based on the PR review.
* Added an option parameter to choose the GEMM type (autotune, int16, packed, int8, mkl)

--- Prev PR ---
* Added FBGEMM as a submodule (internal FBGEMM repo)
* Implemented FBGEMM based packed GEMM
* Implemented vectorized transpose for xxx3 transpose (MKL)
* Auto tuner chooses among int16 GEMM, blas (MKL) GEMM and packed GEMM
  • Loading branch information
ykim362 authored and emjotde committed Jul 4, 2019
2 parents 37808ea + 222ed40 commit 085388b
Show file tree
Hide file tree
Showing 33 changed files with 1,787 additions and 125 deletions.
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@
[submodule "src/3rd_party/nccl"]
path = src/3rd_party/nccl
url = https://github.com/marian-nmt/nccl
[submodule "src/3rd_party/fbgemm"]
path = src/3rd_party/fbgemm
url = https://machinetranslation.visualstudio.com/DefaultCollection/MachineTranslation/_git/FBGEMM-internal
branch = master
84 changes: 47 additions & 37 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ if (POLICY CMP0074)
cmake_policy(SET CMP0074 NEW) # CMake 3.12
endif ()


project(marian CXX C)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand All @@ -22,6 +21,7 @@ option(USE_MPI "Use MPI library" OFF)
option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_TESTS "Compile tests" OFF)
option(COMPILE_SERVER "Compile marian-server" ON)
option(USE_FBGEMM "Use FBGEMM" ON)

# Project versioning
find_package(Git QUIET)
Expand Down Expand Up @@ -59,44 +59,54 @@ if(MSVC)

find_library(SHLWAPI Shlwapi.lib)
set(EXT_LIBS ${EXT_LIBS} SHLWAPI)
else()
else(MSVC)

# Detect supported CPU intrinsics for the current platform. This will
# only be used with BUILD_ARCH=native. For overridden BUILD_ARCH we
# minimally use -msse4.1. This seems to work with MKL.
set(INTRINSICS "")
if(BUILD_ARCH STREQUAL "native")
message(STATUS "Checking support for CPU intrinsics")
include(FindSSE)
if(SSE2_FOUND)
message(STATUS "SSE2 support found")
set(INTRINSICS "${INTRINSICS} -msse2")
endif(SSE2_FOUND)
if(SSE3_FOUND)
message(STATUS "SSE3 support found")
set(INTRINSICS "${INTRINSICS} -msse3")
endif(SSE3_FOUND)
if(SSE4_1_FOUND)
message(STATUS "SSE4.1 support found")
set(INTRINSICS "${INTRINSICS} -msse4.1")
endif(SSE4_1_FOUND)
if(AVX_FOUND)
message(STATUS "AVX support found")
set(INTRINSICS "${INTRINSICS} -mavx")
endif(AVX_FOUND)
if(AVX2_FOUND)
message(STATUS "AVX2 support found")
set(INTRINSICS "${INTRINSICS} -mavx2")
endif(AVX2_FOUND)
if(AVX512_FOUND)
message(STATUS "AVX512 support found")
set(INTRINSICS "${INTRINSICS} -mavx512f")
list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx512f)
endif(AVX512_FOUND)
else()
set(INTRINSICS "-msse4.1")
endif()

# Detect supported CPU intrinsics for the current platform. This will
# only be used with BUILD_ARCH=native. For overridden BUILD_ARCH we
# minimally use -msse4.1. This seems to work with MKL.
set(INTRINSICS "")
if(BUILD_ARCH STREQUAL "native")
message(STATUS "Checking support for CPU intrinsics")
include(FindSSE)
if(SSE2_FOUND)
message(STATUS "SSE2 support found")
set(INTRINSICS "${INTRINSICS} -msse2")
endif(SSE2_FOUND)
if(SSE3_FOUND)
message(STATUS "SSE3 support found")
set(INTRINSICS "${INTRINSICS} -msse3")
endif(SSE3_FOUND)
if(SSE4_1_FOUND)
message(STATUS "SSE4.1 support found")
set(INTRINSICS "${INTRINSICS} -msse4.1")
endif(SSE4_1_FOUND)
if(AVX_FOUND)
message(STATUS "AVX support found")
set(INTRINSICS "${INTRINSICS} -mavx")
endif(AVX_FOUND)
if(AVX2_FOUND)
message(STATUS "AVX2 support found")
set(INTRINSICS "${INTRINSICS} -mavx2")
endif(AVX2_FOUND)
else()
set(INTRINSICS "-msse4.1")
endif()
if(USE_FBGEMM)
set(EXT_LIBS ${EXT_LIBS} fbgemm dl)
add_definitions(-DUSE_FBGEMM=1)
endif(USE_FBGEMM)

set(DISABLE_GLOBALLY "-Wno-unused-result")
set(DISABLE_GLOBALLY "-Wno-unused-result")

# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)

# This warning does not exist prior to gcc 5.0
if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
Expand All @@ -111,7 +121,7 @@ list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -W
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic")
set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
endif()
endif(MSVC)

# Downloading SentencePiece if requested and set to compile with it.
# Requires all the dependencies imposed by SentencePiece
Expand Down
15 changes: 9 additions & 6 deletions cmake/FindMKL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ else()
set(COR_LIB "mkl_core")
endif()

if(MSVC)
set(ProgramFilesx86 "ProgramFiles(x86)")
set(INTEL_ROOT_DEFAULT $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows)
else()
set(INTEL_ROOT_DEFAULT "/opt/intel")
if(MSVC)
set(ProgramFilesx86 "ProgramFiles(x86)")
set(INTEL_ROOT_DEFAULT $ENV{${ProgramFilesx86}}/IntelSWTools/compilers_and_libraries/windows)
else()
set(INTEL_ROOT_DEFAULT "/opt/intel")
endif()
set(INTEL_ROOT ${INTEL_ROOT_DEFAULT} CACHE PATH "Folder contains intel libs")
find_path(MKL_ROOT include/mkl.h PATHS $ENV{MKLROOT} ${INTEL_ROOT}/mkl
Expand Down Expand Up @@ -89,7 +89,10 @@ find_library(MKL_CORE_LIBRARY
NO_DEFAULT_PATH)

set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIR})
set(MKL_LIBRARIES ${MKL_INTERFACE_LIBRARY} ${MKL_SEQUENTIAL_LAYER_LIBRARY} ${MKL_CORE_LIBRARY})
# Added -Wl block to avoid circular dependencies.
# https://stackoverflow.com/questions/5651869/what-are-the-start-group-and-end-group-command-line-options
# https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor
set(MKL_LIBRARIES -Wl,--start-group ${MKL_INTERFACE_LIBRARY} ${MKL_SEQUENTIAL_LAYER_LIBRARY} ${MKL_CORE_LIBRARY} -Wl,--end-group)

# message("1 ${MKL_INCLUDE_DIR}")
# message("2 ${MKL_INTERFACE_LIBRARY}")
Expand Down
23 changes: 22 additions & 1 deletion cmake/FindSSE.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
ELSE (AVX2_TRUE)
set(AVX2_FOUND false CACHE BOOL "AVX2 available on host")
ENDIF (AVX2_TRUE)

STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO})
STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE)
IF (AVX512_TRUE)
set(AVX512_FOUND true CACHE BOOL "AVX512 available on host")
ELSE (AVX512_TRUE)
set(AVX512_FOUND false CACHE BOOL "AVX512 available on host")
ENDIF (AVX512_TRUE)

ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE
Expand Down Expand Up @@ -108,6 +116,14 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
ELSE (AVX2_TRUE)
set(AVX2_FOUND false CACHE BOOL "AVX2 available on host")
ENDIF (AVX2_TRUE)

STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO})
STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE)
IF (AVX512_TRUE)
set(AVX512_FOUND true CACHE BOOL "AVX512 available on host")
ELSE (AVX512_TRUE)
set(AVX512_FOUND false CACHE BOOL "AVX512 available on host")
ENDIF (AVX512_TRUE)

ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows")
# TODO
Expand All @@ -117,13 +133,15 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows")
set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
set(AVX_FOUND false CACHE BOOL "AVX available on host")
set(AVX2_FOUND false CACHE BOOL "AVX2 available on host")
set(AVX512_FOUND false CACHE BOOL "AVX512 available on host")
ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
set(AVX_FOUND false CACHE BOOL "AVX available on host")
set(AVX2_FOUND false CACHE BOOL "AVX2 available on host")
set(AVX512_FOUND false CACHE BOOL "AVX512 available on host")
ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux")

if(NOT SSE2_FOUND)
Expand All @@ -144,5 +162,8 @@ endif(NOT AVX_FOUND)
if(NOT AVX2_FOUND)
MESSAGE(STATUS "Could not find hardware support for AVX2 on this machine.")
endif(NOT AVX2_FOUND)
if(NOT AVX512_FOUND)
MESSAGE(STATUS "Could not find hardware support for AVX512 on this machine.")
endif(NOT AVX512_FOUND)

mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND, AVX_FOUND, AVX2_FOUND)
# Hide the CPU-feature detection results from the default cache view.
# CMake arguments are separated by whitespace only: a trailing comma becomes
# part of the variable name, so the list must not contain commas.
mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND AVX_FOUND AVX2_FOUND AVX512_FOUND)
12 changes: 12 additions & 0 deletions src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ add_subdirectory(./SQLiteCpp)
add_subdirectory(./pathie-cpp)
add_subdirectory(./zlib)

# Add the FBGEMM subdirectory to the build only when USE_FBGEMM is enabled.
if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This suppresses CMake developer warnings by setting
# CMAKE_SUPPRESS_DEVELOPER_WARNINGS (only if it is not already defined); it is meant to silence
# the CMake files of 3rd_party tools.
if(NOT DEFINED CMAKE_SUPPRESS_DEVELOPER_WARNINGS)
set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS 1 CACHE INTERNAL "No dev warnings")
endif()

# Switch off the FBGEMM test and benchmark options before its directory is added
# (presumably read by fbgemm's own CMakeLists.txt -- confirm against the submodule).
set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests")
set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark")
add_subdirectory(./fbgemm)
endif(USE_FBGEMM)

if(USE_SENTENCEPIECE)
if(USE_STATIC_LIBS)
set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/fbgemm
Submodule fbgemm added at bc33ed
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include_directories(.)
include_directories(3rd_party)
include_directories(3rd_party/SQLiteCpp/include)
include_directories(3rd_party/sentencepiece)
include_directories(3rd_party/fbgemm/include)

add_library(marian STATIC
common/version.cpp
Expand Down Expand Up @@ -40,6 +41,7 @@ add_library(marian STATIC
tensors/cpu/sharp/int_gemm.cpp
tensors/cpu/sharp/avx_gemm.cpp
tensors/cpu/sharp/sse_gemm.cpp
tensors/cpu/sharp/packed_gemm.cpp

graph/expression_graph.cpp
graph/expression_operators.cpp
Expand Down
3 changes: 2 additions & 1 deletion src/command/marian_conv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ int main(int argc, char** argv) {
marian::io::getYamlFromModel(config, "special:model.yml", modelFrom);
configStr << config;

auto graph = New<ExpressionGraph>(true, false);
auto graph = New<ExpressionGraph>(true);
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);

graph->load(modelFrom);
graph->forward();
Expand Down
3 changes: 3 additions & 0 deletions src/common/config_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<std::string>("--gemm-type",
"Select GEMM options: auto, mklfp32, intrinint16, fp16packed, int8packed",
"auto");

cli.add<std::vector<std::string>>("--shortlist",
"Use softmax shortlist: path first best prune");
Expand Down
2 changes: 1 addition & 1 deletion src/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ add_executable(mnist_example mnist/mnist_ffnn.cpp)
foreach(exec iris_example mnist_example)
target_link_libraries(${exec} marian ${EXT_LIBS})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS})
target_link_libraries(${exec} marian ${EXT_LIBS} marian_cuda ${EXT_LIBS})
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)
17 changes: 14 additions & 3 deletions src/graph/auto_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,26 @@ class AutoTuner : public AutoTunerRecorder {
private:
typedef std::function<Return(Args...)> Algorithm;

const size_t max = 100;
// When the autotuner decides the fastest algorithm for a specific tensor operation (e.g. GEMM),
// the autotuner runs each algorithm at least this 'collectStatMax' number of times and
// collects the statistics.
const size_t collectStatMax = 50;

UPtr<timer::CPUTimer> timer_;

// This structure pairs a hash key with an algorithm function (e.g. int16, packed gemm, mkl gemm)
// for a specific operation size.
// hash: a unique hash key for each operation size
// (e.g. m, n, k, transpose A, transpose B, bias size for GEMM)
// algorithm: the function that implements the algorithm
struct HashedAlgorithm {
size_t hash;
Algorithm algorithm;
};

// This structure represents the collected statistics.
// time: total accumulated time of this operator execution with the given algorithm
// runs: total number of times this algorithm was executed
struct Stat {
double time;
size_t runs;
Expand All @@ -53,7 +64,7 @@ class AutoTuner : public AutoTunerRecorder {
auto& stat = it->second;

// collect more stats
if(stat.runs < max)
if(stat.runs < collectStatMax)
return i;

if(stat.time < bestTime) {
Expand Down Expand Up @@ -93,7 +104,7 @@ class AutoTuner : public AutoTunerRecorder {

auto it = stats_.find(hash);
if(it != stats_.end()) {
if(it->second.runs < max) {
if(it->second.runs < collectStatMax) {
it->second.time += seconds.count();
it->second.runs += 1;
}
Expand Down
4 changes: 2 additions & 2 deletions src/graph/expression_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

namespace marian {

ExpressionGraph::ExpressionGraph(bool inference, bool optimized)
: inferenceOnly_(inference), optimized_(optimized), backend_(nullptr) {}
ExpressionGraph::ExpressionGraph(bool inference)
: inferenceOnly_(inference), backend_(nullptr) {}

void ExpressionGraph::setDevice(DeviceId deviceId, Ptr<Device> device) {
if(!backend_) {
Expand Down
7 changes: 1 addition & 6 deletions src/graph/expression_graph.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#pragma once

#include "common/config.h"
#include "common/definitions.h"

#include "tensors/backend.h"
#include "tensors/tensor_allocator.h"
Expand Down Expand Up @@ -130,7 +129,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
std::unordered_map<size_t, std::vector<Expr>> memoized_;

bool inferenceOnly_{false};
bool optimized_{false};
Ptr<Backend> backend_;

bool reloaded_{false};
Expand All @@ -148,7 +146,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
*
* Constructor should be used as New<ExpressionGraph>()
*/
ExpressionGraph(bool inference = false, bool optimized = false);
ExpressionGraph(bool inference = false);

void setInference(bool inference) { inferenceOnly_ = inference; }
bool isInference() { return inferenceOnly_; }
Expand All @@ -165,9 +163,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {

Ptr<Backend> getBackend() { return backend_; }

void setOptimized(bool optimized) { optimized_ = optimized; }
bool isOptimized() { return (optimized_ && inferenceOnly_); }

void switchParams(const std::string& newNamespace) {
namespace_ = newNamespace;
}
Expand Down
Loading

0 comments on commit 085388b

Please sign in to comment.