cmake_minimum_required(VERSION 3.25)

# Version information
# The version numbers are parsed out of makefiles/version.mk. The path is
# resolved relative to this CMakeLists.txt (not CMAKE_SOURCE_DIR) so the lookup
# also works when this directory is added to another build via add_subdirectory().
set(NCCL_VERSION_MK "${CMAKE_CURRENT_SOURCE_DIR}/makefiles/version.mk")
file(READ "${NCCL_VERSION_MK}" VERSION_CONTENT)

# Re-run the configure step whenever version.mk changes, so stale version
# numbers are never baked into the build.
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${NCCL_VERSION_MK}")

# Each expression captures the value on the right-hand side of "<NAME> := <value>".
string(REGEX REPLACE ".*NCCL_MAJOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MAJOR "${VERSION_CONTENT}")
string(REGEX REPLACE ".*NCCL_MINOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MINOR "${VERSION_CONTENT}")
string(REGEX REPLACE ".*NCCL_PATCH[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_PATCH "${VERSION_CONTENT}")
string(REGEX REPLACE ".*NCCL_SUFFIX[ ]*:=[ ]*([a-zA-Z0-9]*).*" "\\1" NCCL_SUFFIX "${VERSION_CONTENT}")
string(REGEX REPLACE ".*PKG_REVISION[ ]*:=[ ]*([0-9]+).*" "\\1" PKG_REVISION "${VERSION_CONTENT}")

# string(REGEX REPLACE) returns its input unchanged when the pattern does not
# match, which would leave the whole file content in the variable. Detect that
# here and fail with a clear message instead of a confusing math() error below.
foreach(_nccl_version_field IN ITEMS NCCL_MAJOR NCCL_MINOR NCCL_PATCH)
    if(NOT "${${_nccl_version_field}}" MATCHES "^[0-9]+$")
        message(FATAL_ERROR "Could not parse ${_nccl_version_field} from ${NCCL_VERSION_MK}")
    endif()
endforeach()

# The suffix and package revision are not needed for NCCL_VERSION_CODE; fall
# back to an empty value rather than carrying unparsed file content around.
if(NOT "${NCCL_SUFFIX}" MATCHES "^[a-zA-Z0-9]*$")
    set(NCCL_SUFFIX "")
endif()
if(NOT "${PKG_REVISION}" MATCHES "^[0-9]+$")
    message(WARNING "Could not parse PKG_REVISION from ${NCCL_VERSION_MK}; leaving it empty")
    set(PKG_REVISION "")
endif()

# Single integer encoding of the version: major * 10000 + minor * 100 + patch.
math(EXPR NCCL_VERSION_CODE "(${NCCL_MAJOR} * 10000) + (${NCCL_MINOR} * 100) + ${NCCL_PATCH}")

# Expose the build-system marker and the parsed version numbers as
# preprocessor macros for every target defined below this directory.
add_compile_definitions(NCCL_USE_CMAKE)
add_compile_definitions(
    NCCL_MAJOR=${NCCL_MAJOR}
    NCCL_MINOR=${NCCL_MINOR}
    NCCL_PATCH=${NCCL_PATCH}
)
add_compile_definitions(NCCL_VERSION_CODE=${NCCL_VERSION_CODE})

# Also flag the CMake build through the environment. set(ENV{...}) only affects
# the running CMake process (and what it launches while configuring).
set(ENV{NCCL_USE_CMAKE} "1")

# Declare the project with the version parsed above; CUDA, C++ and C are enabled.
project(
    NCCL
    VERSION "${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}"
    LANGUAGES CUDA CXX C
)

# Provides the CMAKE_INSTALL_<dir> variables (e.g. CMAKE_INSTALL_LIBDIR).
include(GNUInstallDirs)

# Fall back to a Release build when the user did not pick a build type.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release")
endif()

# Package building options
option(BUILD_DEBIAN_PACKAGE "Build Debian package" OFF)
option(BUILD_REDHAT_PACKAGE "Build Redhat package" OFF)
option(BUILD_TXZ_PACKAGE "Build TXZ package" OFF)
option(BUILD_SRCTXZ_PACKAGE "Build source tarball package" OFF)
option(BUILD_PACKAGES "Build all packages" OFF)

# General build behaviour
option(VERBOSE "Enable verbose output" OFF)
option(KEEP "Keep intermediate files" OFF)
option(DEBUG "Enable debug build" OFF)
option(ASAN "Enable Address Sanitizer" OFF)
option(UBSAN "Enable Undefined Behavior Sanitizer" OFF)
option(TSAN "Enable Thread Sanitizer" OFF)
option(TRACE "Enable tracing" OFF)
option(WERROR "Treat warnings as errors" OFF)

# Feature toggles
option(PROFAPI "Enable profiling API" ON)
option(NVTX "Enable NVTX" ON)
option(RDMA_CORE "Enable RDMA core" OFF)
option(NET_PROFILER "Enable network profiler" OFF)
option(MLX5DV "Enable MLX5DV" OFF)

# MAX_EXT_NET_PLUGINS is a count, not an on/off switch, so it is declared as a
# STRING cache entry instead of option() (which only models booleans and would
# show up as a checkbox in cmake-gui/ccmake). The default of 0 keeps the feature
# disabled; caches created earlier that still hold "OFF" behave the same way
# because the value is only ever tested with "GREATER 0".
set(MAX_EXT_NET_PLUGINS "0" CACHE STRING "Maximum external network plugins")

option(EMIT_LLVM_IR "Generate LLVM IR for device APIs" OFF)
option(BUILD_NCCL4PY "Enable nccl4py Python bindings targets" OFF)
option(BUILD_NCCL_EP "Build NCCL EP contrib library" OFF)

# Detect the target OS. The macros are added with add_compile_definitions so
# that CUDA .cu files compiled by NVCC receive them as well.
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
    set(NCCL_OS_LINUX ON)
    add_compile_definitions(NCCL_OS_LINUX)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
    set(NCCL_OS_WINDOWS ON)
    # WIN32_LEAN_AND_MEAN / NOMINMAX: keep windows.h from defining min/max
    # macros so std::min/std::max work.
    add_compile_definitions(
        NCCL_OS_WINDOWS
        WIN32_LEAN_AND_MEAN
        NOMINMAX
    )
else()
    message(FATAL_ERROR "Unsupported OS: ${CMAKE_SYSTEM_NAME}")
endif()

# Detect the host compiler family; Clang is handled through the GCC code path.
if(CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang)$")
    set(NCCL_COMPILER_GCC ON)
    add_compile_definitions(NCCL_COMPILER_GCC)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
    set(NCCL_COMPILER_MSVC ON)
    add_compile_definitions(NCCL_COMPILER_MSVC)
else()
    message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}")
endif()

find_package(CUDAToolkit REQUIRED)
find_package(Threads REQUIRED)
find_package(Python3 REQUIRED COMPONENTS Interpreter)

# CUDA version detection
# Split the leading "<major>.<minor>" of CUDAToolkit_VERSION in one match:
#   CUDA_VERSION = "<major>.<minor>", CUDA_MAJOR = "<major>", CUDA_MINOR = "<minor>"
# Failing here gives a clear diagnostic instead of empty values that would
# break the numeric comparisons performed later in this file.
if(NOT "${CUDAToolkit_VERSION}" MATCHES "([0-9]+)\\.([0-9]+)")
    message(FATAL_ERROR "Unable to parse CUDA toolkit version: '${CUDAToolkit_VERSION}'")
endif()
set(CUDA_VERSION "${CMAKE_MATCH_0}")
set(CUDA_MAJOR "${CMAKE_MATCH_1}")
set(CUDA_MINOR "${CMAKE_MATCH_2}")

# Add CUDA version definitions after find_package
add_compile_definitions(
    CUDA_MAJOR=${CUDA_MAJOR}
    CUDA_MINOR=${CUDA_MINOR}
)

# Pick the language level from the toolkit: CUDA 13.0 requires C++17, older
# toolkits are built as C++14. Host and device code use the same level.
if(${CUDA_MAJOR} GREATER_EQUAL 13)
    set(_nccl_language_level 17)
else()
    set(_nccl_language_level 14)
endif()

set(CMAKE_CXX_STANDARD ${_nccl_language_level})
set(CMAKE_CUDA_STANDARD ${_nccl_language_level})
unset(_nccl_language_level)

# No compiler-specific extensions, and the requested standard is mandatory.
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# CUDA architecture flags
# When the user did not choose CMAKE_CUDA_ARCHITECTURES, pick a default list
# matching what the detected toolkit is able to compile for.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
    message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined or empty, setting default values based on CUDA version")

    if(CUDA_MAJOR LESS 9)
        set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61")
    elseif(CUDA_MAJOR EQUAL 9 OR CUDA_MAJOR EQUAL 10)
        set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70")
    elseif(CUDA_MAJOR EQUAL 11)
        # sm_90 is only added from CUDA 11.8 onwards
        if(CUDA_MINOR LESS 8)
            set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80")
        else()
            set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80;90")
        endif()
    elseif(CUDA_MAJOR EQUAL 12)
        # sm_35 is no longer listed for CUDA 12; sm_100/sm_120 are added from 12.8 onwards
        if(CUDA_MINOR LESS 8)
            set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90")
        else()
            set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;120")
        endif()
    else()
        # CUDA 13 and future versions: the CUDA 13 toolkit dropped compilation
        # support for architectures older than sm_75, so listing 50/60/61/70
        # makes nvcc reject the build. Start at 75 and include every
        # architecture up to the latest known one.
        set(CMAKE_CUDA_ARCHITECTURES "75;80;90;100;110;120")
    endif()
endif()
message(STATUS "Using CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")

# Compiler-specific flags
# Appends the project-wide host (CXX) and device (CUDA) flags for the detected
# compiler family to whatever the user already supplied.
if(NCCL_COMPILER_GCC)
    # GCC/Clang-specific flags
    string(APPEND CMAKE_CXX_FLAGS " -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -Wvla -g")
    string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -fPIC")
elseif(NCCL_COMPILER_MSVC)
    # MSVC-specific flags
    # /Zc:preprocessor enables conformant preprocessor for proper variadic macro handling (needed for NVTX)
    # /wd4146 disables "unary minus on unsigned" warning - intentional for two's complement and alignment masks
    # /wd4197 disables "top-level volatile in cast ignored" - benign when using atomic intrinsics
    # /wd5105 disables "macro expansion producing 'defined'" - triggered by Windows SDK headers (winbase.h)
    # /wd4805 disables "unsafe mix of type 'bool' and type 'int'" - intentional bool/int mixing for flags
    # /wd4018 disables "signed/unsigned mismatch" - safe comparisons where signed value is always positive
    string(APPEND CMAKE_CXX_FLAGS " /W3 /wd4267 /wd4244 /wd4996 /wd4146 /wd4197 /wd5105 /wd4805 /wd4018 /FS /Zc:preprocessor")
    # --compress-mode=balance is not supported by ptxas in CUDA 13.1 on Windows, so it is
    # deliberately NOT passed here (it used to be listed despite this note).
    # -Xcompiler /Zc:preprocessor passes the conformant preprocessor flag to cl.exe when invoked by nvcc
    string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda --ptxas-options=-maxrregcount=96 -Xcompiler /Zc:preprocessor")
endif()

# Sanitizer options (GCC/Clang only)
# Each enabled sanitizer adds its -fsanitize flag to the C++ compile flags and
# to the executable link flags, together with the matching static runtime.
if(NCCL_COMPILER_GCC)
    # Thread and address sanitizers are mutually exclusive; reject the
    # combination before touching any flags.
    if(TSAN AND ASAN)
        message(FATAL_ERROR "TSAN and ASAN cannot be enabled simultaneously")
    endif()

    if(ASAN)
        string(APPEND CMAKE_CXX_FLAGS " -fsanitize=address")
        string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address -static-libasan")
    endif()

    if(UBSAN)
        string(APPEND CMAKE_CXX_FLAGS " -fsanitize=undefined")
        string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=undefined -static-libubsan")
    endif()

    if(TSAN)
        string(APPEND CMAKE_CXX_FLAGS " -fsanitize=thread")
        string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=thread -static-libtsan")
    endif()
endif()

# Additional options
# Translate the simple feature switches into preprocessor definitions.
if(TRACE)
    add_compile_definitions(ENABLE_TRACE)
endif()

# NVTX is on by default; the macro is only emitted to switch it off.
if(NOT NVTX)
    add_compile_definitions(NVTX_DISABLE)
endif()

# Promote C++ warnings to errors using the spelling of the active compiler.
if(WERROR)
    if(NOT NCCL_COMPILER_MSVC)
        string(APPEND CMAKE_CXX_FLAGS " -Werror")
    else()
        string(APPEND CMAKE_CXX_FLAGS " /WX")
    endif()
endif()

if(PROFAPI)
    add_compile_definitions(PROFAPI)
endif()

# Extra system libraries collected by the feature checks below; starts empty.
set(EXTRA_LIBS)

# RDMA and MLX5DV are Linux-specific features
if(RDMA_CORE)
    add_definitions(-DNCCL_BUILD_RDMA_CORE=1)
    # rdma-core ships the verbs library as libibverbs, so the name to search
    # for (and link) is "ibverbs"; searching for "verbs" never finds it.
    find_library(VERBS_LIBRARY NAMES ibverbs)
    if(VERBS_LIBRARY)
        list(APPEND EXTRA_LIBS ibverbs)
    else()
        # Keep configuring (as before), but make the likely link failure visible.
        message(WARNING "RDMA_CORE is enabled but libibverbs was not found; linking may fail")
    endif()
endif()

if(MLX5DV)
    add_definitions(-DNCCL_BUILD_MLX5DV=1)
    find_library(MLX5_LIBRARY NAMES mlx5)
    if(MLX5_LIBRARY)
        list(APPEND EXTRA_LIBS mlx5)
    else()
        # Keep configuring (as before), but make the likely link failure visible.
        message(WARNING "MLX5DV is enabled but libmlx5 was not found; linking may fail")
    endif()
endif()

# Optional network profiling support
if(NET_PROFILER)
    add_compile_definitions(NCCL_ENABLE_NET_PROFILING=1)
endif()

# Only forward the plugin limit when a positive value was requested
if(MAX_EXT_NET_PLUGINS GREATER 0)
    add_compile_definitions(NCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS})
endif()

# GIN/DOCA (GDAKI): enable on non-Windows; skip only for Windows platform (no doca_gpunetio / InfiniBand)
if(NOT NCCL_OS_WINDOWS)
    add_compile_definitions(
        DOCA_VERBS_USE_CUDA_WRAPPER
        DOCA_VERBS_USE_NET_WRAPPER
        NCCL_GIN_PROXY_ENABLE=1
    )
else()
    # Windows: explicitly disable GIN so all code (host + device) and headers see it as 0
    add_compile_definitions(NCCL_GIN_PROXY_ENABLE=0)
endif()

if(EMIT_LLVM_IR)
    add_compile_definitions(EMIT_LLVM_IR=1)
endif()

# Library dependencies
# librt is linked only where the toolchain actually provides it.
find_library(RT_LIBRARY NAMES rt)
if(RT_LIBRARY)
    list(APPEND EXTRA_LIBS rt)
endif()

# Debug/Release specific flags
# CMake always places CMAKE_<LANG>_FLAGS on the command line ahead of
# CMAKE_<LANG>_FLAGS_<CONFIG>, so the per-configuration variables must hold only
# the configuration-specific switches. Embedding ${CMAKE_<LANG>_FLAGS} in them
# (as was done before) repeated every common flag on each compile line.
# These assignments intentionally replace CMake's built-in per-config defaults.
if(NCCL_COMPILER_GCC)
    set(CMAKE_CXX_FLAGS_DEBUG "-O0")
    set(CMAKE_CXX_FLAGS_RELEASE "-O3")
elseif(NCCL_COMPILER_MSVC)
    set(CMAKE_CXX_FLAGS_DEBUG "/Od /Zi /FS")
    set(CMAKE_CXX_FLAGS_RELEASE "/O2 /FS")   # Maximum available optimization for MSVC
endif()

# Device code: -G/-g add device and host debug information in Debug builds.
set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -G -g")
set(CMAKE_CUDA_FLAGS_RELEASE "-O3")

# The plugin directories are only part of the build on Linux.
if(NCCL_OS_LINUX)
  foreach(_nccl_plugin_dir IN ITEMS
      plugins/mixed/example
      plugins/net
      plugins/profiler/example
      plugins/tuner/example
      plugins/env/example)
    add_subdirectory(${_nccl_plugin_dir})
  endforeach()
endif()

add_subdirectory(src)

# Add package building subdirectories
# Each package kind is enabled by its own BUILD_<KIND>_PACKAGE switch or by the
# catch-all BUILD_PACKAGES; the directory name is the lower-cased kind.
foreach(_nccl_pkg_kind IN ITEMS DEBIAN REDHAT TXZ SRCTXZ)
  if(BUILD_${_nccl_pkg_kind}_PACKAGE OR BUILD_PACKAGES)
    string(TOLOWER "${_nccl_pkg_kind}" _nccl_pkg_dir)
    add_subdirectory(pkg/${_nccl_pkg_dir})
  endif()
endforeach()

# Optional LLVM IR generation for the device APIs
if(EMIT_LLVM_IR)
    add_subdirectory(bindings/ir)
    # llvm_ir is built after nccl_header
    add_dependencies(llvm_ir nccl_header)
    # Aggregate target in the default build that pulls in both nccl and llvm_ir
    add_custom_target(nccl_with_ir ALL DEPENDS nccl llvm_ir)
    message(STATUS "LLVM IR generation will be included in default build")
endif()

# Optional Python bindings
if(BUILD_NCCL4PY)
  add_subdirectory(bindings/nccl4py)
endif()

# Optional EP contrib library
if(BUILD_NCCL_EP)
  add_subdirectory(contrib/nccl_ep)
endif()

###################### CMake package config for find_package(NCCL) ######################
include(CMakePackageConfigHelpers)

# Install location of the package files, relative to the install prefix.
set(NCCL_CMAKE_CONFIG_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/NCCL")

# Generated files live in the build tree until they are installed.
set(_nccl_generated_config "${CMAKE_CURRENT_BINARY_DIR}/NCCLConfig.cmake")
set(_nccl_generated_version "${CMAKE_CURRENT_BINARY_DIR}/NCCLConfigVersion.cmake")

# Export installed targets (NCCL::nccl, NCCL::nccl_static)
install(
  EXPORT NCCLTargets
  FILE NCCLTargets.cmake
  NAMESPACE NCCL::
  DESTINATION "${NCCL_CMAKE_CONFIG_DIR}"
)

# Generate NCCLConfig.cmake from its template
configure_package_config_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/NCCLConfig.cmake.in"
  "${_nccl_generated_config}"
  INSTALL_DESTINATION "${NCCL_CMAKE_CONFIG_DIR}"
)

# Generate NCCLConfigVersion.cmake; requests with the same major version are accepted
write_basic_package_version_file(
  "${_nccl_generated_version}"
  VERSION "${PROJECT_VERSION}"
  COMPATIBILITY SameMajorVersion
)

# Install both generated files next to the exported targets file
install(
  FILES
    "${_nccl_generated_config}"
    "${_nccl_generated_version}"
  DESTINATION "${NCCL_CMAKE_CONFIG_DIR}"
)

# Also export targets for build-tree usage (optional convenience).
export(
  EXPORT NCCLTargets
  FILE "${CMAKE_CURRENT_BINARY_DIR}/NCCLTargets.cmake"
  NAMESPACE NCCL::
)
