Source code for watchme.watchers.gpu.tasks

'''

Copyright (C) 2019 Vanessa Sochat.

This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

'''

from watchme.utils import get_watchme_env
from .pynvml import (
    nvmlInit, 
    nvmlShutdown
)
from watchme.watchers.gpu import pynvml

def gpu_task(**kwargs):
    '''Get variables about the gpu of the host. No parameters are required.
       We've already instantiated the Task object and have checked that
       the calling host has nvml GPU.

       NVML is initialized once on entry and is always shut down again,
       also when an exception propagates. A metric that cannot be read
       (for example because the device or driver does not support it) is
       left out of the result instead of aborting the whole task.

       Parameters
       ==========
       skip: an optional list of (comma separated) fields to skip. The
             filter is applied to the top level keys of the result, i.e.
             nvml_driver_version,nvml_system_nvml_version,nvml_deviceCount,
             nvml_hic_version,nvml_unit_count,devices (plus any keys added
             from the environment). Whitespace around names is ignored.

       Returns
       =======
       a dictionary with the system level metrics and a "devices" entry
       that maps each device name to a dictionary of its metrics. If more
       than one device reports the same name, the later devices are stored
       under "<name>-<index>" so that they do not overwrite each other.
    '''
    skip = _parse_skip(kwargs.get('skip', ''))

    # Build the lookup tables before touching NVML, so that a missing
    # attribute in the pynvml module cannot leave NVML initialized.

    # High level (system wide) metrics, called without arguments
    system_funcs = {
        'nvml_driver_version': pynvml.nvmlSystemGetDriverVersion,
        'nvml_system_nvml_version': pynvml.nvmlSystemGetNVMLVersion,
        'nvml_deviceCount': pynvml.nvmlDeviceGetCount,
        'nvml_hic_version': pynvml.nvmlSystemGetHicVersion,
        'nvml_unit_count': pynvml.nvmlUnitGetCount,
    }

    # Per device metrics, each called with the device handle only.
    # NOTE: several keys are misspelled or redundant (inforam, grapics,
    # managerment, management_mode). They are kept exactly as they are
    # because previously recorded results use these names.
    # NOTE: nvmlDeviceSetCpuAffinity used to be listed here. It is a
    # setter that changes the CPU affinity of the calling thread and
    # returns None, so it must not be called by a read-only watcher.
    device_funcs = {
        'nvml_device_board_id': pynvml.nvmlDeviceGetBoardId,
        'nvml_device_multi_gpu_board': pynvml.nvmlDeviceGetMultiGpuBoard,
        'nvml_device_brand': pynvml.nvmlDeviceGetBrand,
        'nvml_device_serial': pynvml.nvmlDeviceGetSerial,
        'nvml_device_minor_number': pynvml.nvmlDeviceGetMinorNumber,
        'nvml_device_uuid': pynvml.nvmlDeviceGetUUID,
        'nvml_device_inforom_version': pynvml.nvmlDeviceGetInforomImageVersion,
        'nvml_device_inforam_checksum': pynvml.nvmlDeviceGetInforomConfigurationChecksum,
        'nvml_device_display_mode': pynvml.nvmlDeviceGetDisplayMode,
        'nvml_device_display_active': pynvml.nvmlDeviceGetDisplayActive,
        'nvml_device_persistence_mode': pynvml.nvmlDeviceGetPersistenceMode,
        'nvml_device_supported_memory_clocks': pynvml.nvmlDeviceGetSupportedMemoryClocks,
        'nvml_device_fan_speed': pynvml.nvmlDeviceGetFanSpeed,
        'nvml_device_performance_state': pynvml.nvmlDeviceGetPerformanceState,
        'nvml_device_management_mode': pynvml.nvmlDeviceGetPowerManagementMode,
        'nvml_device_power_managerment_mode': pynvml.nvmlDeviceGetPowerManagementMode,
        'nvml_device_power_management_limit': pynvml.nvmlDeviceGetPowerManagementLimit,
        'nvml_device_power_management_limit_constraints': pynvml.nvmlDeviceGetPowerManagementLimitConstraints,
        'nvml_device_power_management_default_limit': pynvml.nvmlDeviceGetPowerManagementDefaultLimit,
        'nvml_device_enforced_power_limit': pynvml.nvmlDeviceGetEnforcedPowerLimit,
        'nvml_device_power_usage': pynvml.nvmlDeviceGetPowerUsage,
        'nvml_device_gpu_operation_mode': pynvml.nvmlDeviceGetGpuOperationMode,
        'nvml_device_current_operation_mode': pynvml.nvmlDeviceGetCurrentGpuOperationMode,
        'nvml_device_pending_gpu_operation_mode': pynvml.nvmlDeviceGetPendingGpuOperationMode,
        'nvml_device_memory_info': pynvml.nvmlDeviceGetMemoryInfo,
        'nvml_device_bar1_memory_info': pynvml.nvmlDeviceGetBAR1MemoryInfo,
        'nvml_device_compute_mode': pynvml.nvmlDeviceGetComputeMode,
        'nvml_device_ecc_mode': pynvml.nvmlDeviceGetEccMode,
        'nvml_device_current_ecc_mode': pynvml.nvmlDeviceGetCurrentEccMode,
        'nvml_device_pending_ecc_mode': pynvml.nvmlDeviceGetPendingEccMode,
        'nvml_device_utilization_rates': pynvml.nvmlDeviceGetUtilizationRates,
        'nvml_device_encoder_utilization': pynvml.nvmlDeviceGetEncoderUtilization,
        'nvml_device_decoder_utilization': pynvml.nvmlDeviceGetDecoderUtilization,
        'nvml_device_pci_replay_counter': pynvml.nvmlDeviceGetPcieReplayCounter,
        'nvml_device_driver_model': pynvml.nvmlDeviceGetDriverModel,
        'nvml_device_current_driver_model': pynvml.nvmlDeviceGetCurrentDriverModel,
        'nvml_device_pending_driver_model': pynvml.nvmlDeviceGetPendingDriverModel,
        'nvml_device_vbios_version': pynvml.nvmlDeviceGetVbiosVersion,
        'nvml_device_compute_running_processes': pynvml.nvmlDeviceGetComputeRunningProcesses,
        'nvml_device_grapics_running_processes': pynvml.nvmlDeviceGetGraphicsRunningProcesses,
        'nvml_device_auto_boosted_clocks_enabled': pynvml.nvmlDeviceGetAutoBoostedClocksEnabled,
        'nvml_device_supported_event_types': pynvml.nvmlDeviceGetSupportedEventTypes,
        'nvml_device_current_pcie_link_generation': pynvml.nvmlDeviceGetCurrPcieLinkGeneration,
        'nvml_device_max_pcie_link_generation': pynvml.nvmlDeviceGetMaxPcieLinkGeneration,
        'nvml_device_curr_pcie_link_width': pynvml.nvmlDeviceGetCurrPcieLinkWidth,
        'nvml_device_max_pcie_link_width': pynvml.nvmlDeviceGetMaxPcieLinkWidth,
        'nvml_device_supported_clocks_throttle_reasons': pynvml.nvmlDeviceGetSupportedClocksThrottleReasons,
        'nvml_device_current_clocks_throttle_reasons': pynvml.nvmlDeviceGetCurrentClocksThrottleReasons,
        'nvml_device_index': pynvml.nvmlDeviceGetIndex,
        'nvml_device_accounting_mode': pynvml.nvmlDeviceGetAccountingMode,
        'nvml_device_accounting_pids': pynvml.nvmlDeviceGetAccountingPids,
        'nvml_device_accounting_buffer_size': pynvml.nvmlDeviceGetAccountingBufferSize,
    }

    # Functions that need additional args (not collected here)
    # nvmlDeviceGetCpuAffinity(handle, cpuSetSize)
    # nvmlDeviceGetInforomVersion(handle, infoRomObject)
    # nvmlDeviceGetClockInfo(handle, type)
    # nvmlDeviceGetMaxClockInfo(handle, type)
    # nvmlDeviceGetApplicationsClock(handle, type)
    # nvmlDeviceGetDefaultApplicationsClock(handle, type)
    # nvmlDeviceGetSupportedGraphicsClocks(handle, memoryClockMHz)
    # nvmlDeviceGetTemperature(handle, sensor)
    # nvmlDeviceGetTemperatureThreshold(handle, threshold)
    # nvmlDeviceGetTotalEccErrors(handle, errorType, counterType)
    # nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, locationType)
    # nvmlDeviceRegisterEvents(handle, eventTypes, eventSet)
    # nvmlEventSetWait(eventSet, timeoutms)
    # nvmlDeviceOnSameBoard(handle1, handle2)
    # nvmlDeviceGetAccountingStats(handle, pid)

    # Setting functions that return None (must never be called here)
    # nvmlDeviceSetCpuAffinity(handle)
    # nvmlDeviceClearCpuAffinity(handle)
    # nvmlDeviceValidateInforom(handle)
    # nvmlUnitSetLedState(unit, color)
    # nvmlDeviceSetPersistenceMode(handle, mode)
    # nvmlDeviceSetComputeMode(handle, mode)
    # nvmlDeviceSetEccMode(handle, mode)
    # nvmlDeviceClearEccErrorCounts(handle, counterType)
    # nvmlDeviceSetDriverModel(handle, model)
    # nvmlDeviceSetAutoBoostedClocksEnabled(handle, enabled)
    # nvmlDeviceSetDefaultAutoBoostedClocksEnabled(handle, enabled, flags)
    # nvmlDeviceSetApplicationsClocks(handle, maxMemClockMHz, maxGraphicsClockMHz)
    # nvmlDeviceResetApplicationsClocks(handle)
    # nvmlDeviceSetPowerManagementLimit(handle, limit)
    # nvmlDeviceSetGpuOperationMode(handle, mode)
    # nvmlDeviceSetAccountingMode(handle, mode)
    # nvmlDeviceClearAccountingPids(handle)

    results = {}
    devices = {}

    # NVML reference counts init/shutdown, so call each exactly once.
    # The old code called nvmlInit() again after every failed metric,
    # which the single nvmlShutdown() at the end could never balance.
    nvmlInit()
    try:
        # Run through high level metrics
        for name, func in system_funcs.items():
            try:
                results[name] = func()
            except Exception:
                # Best effort: an unsupported metric is simply omitted
                continue

        # The count is missing if the query above failed; then there are
        # no devices we can look at.
        device_count = results.get('nvml_deviceCount', 0)

        # Look at individual devices
        for index in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            name = pynvml.nvmlDeviceGetName(handle)
            label = _unique_device_key(name, index, devices)
            devices[label] = {}

            for key, func in device_funcs.items():
                try:
                    value = _normalize_device_result(key, func(handle))
                except Exception:
                    # Best effort: an unsupported metric is simply omitted
                    continue
                devices[label][key] = value
    finally:
        nvmlShutdown()

    results['devices'] = devices
    return _filter_result(results, skip)


def _parse_skip(skip):
    '''turn the user provided skip value (a comma separated string, or
       already a list/tuple of names) into a list of non-empty names with
       the surrounding whitespace removed.
    '''
    if not skip:
        return []
    if isinstance(skip, str):
        skip = skip.split(',')
    names = []
    for entry in skip:
        entry = entry.strip() if isinstance(entry, str) else entry
        if entry:
            names.append(entry)
    return names


def _unique_device_key(name, index, taken):
    '''return the key under which a device is stored: its name, or
       "<name>-<index>" when that name is already present in taken (hosts
       with several identical GPUs report the same name for each of them).
    '''
    if name not in taken:
        return name
    suffix = '-%d' % index
    # Keep the type of the name (the NVML bindings may hand back bytes)
    if isinstance(name, bytes) and not isinstance(suffix, bytes):
        suffix = suffix.encode('ascii')
    return name + suffix


def _normalize_device_result(key, result):
    '''convert the value returned for a device metric into plain python
       types: a map object becomes a list, and the memory / utilization
       structures become dictionaries of the attributes we record.
    '''
    if isinstance(result, map):
        result = list(result)

    if key == 'nvml_device_bar1_memory_info':
        return {"bar1Free": result.bar1Free,
                "bar1Total": result.bar1Total,
                "bar1Used": result.bar1Used}

    if key == 'nvml_device_utilization_rates':
        return {"gpu": result.gpu,
                "memory": result.memory}

    if key == 'nvml_device_memory_info':
        return {"free": result.free,
                "total": result.total,
                "used": result.used}

    return result


def _filter_result(results, skip):
    '''merge the watchme environment variables into a result dictionary
       and then drop every key that the user asked to skip. The dictionary
       is changed in place and also returned.

       Parameters
       ==========
       results: a dictionary of results (updated in place)
       skip: a list of keys to remove/filter from the result.
    '''

    # The environment (get_watchme_env) is merged first, so keys that
    # come from it can be removed through skip as well.
    results.update(get_watchme_env())

    # Names in skip that are not present are silently ignored
    for unwanted in skip:
        results.pop(unwanted, None)

    return results