diff --git a/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py b/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py index 9cf609deea..b9d5d07a91 100644 --- a/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py @@ -11,9 +11,8 @@ class iSoftmaxPreAllocatedBuffTemplate(NodeTemplate): - @staticmethod def computeTransientBuffersSize( - ctxt: NetworkContext, + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: lastDimBuffer_dim = 8 * 4 * operatorRepresentation['lastDimLength'] @@ -22,8 +21,7 @@ def computeTransientBuffersSize( def hoistTransientBuffers(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - lastDimBuffer_name, lastDimBuffer_dim = iSoftmaxPreAllocatedBuffTemplate.computeTransientBuffersSize( - ctxt, operatorRepresentation)[0] + lastDimBuffer_name, lastDimBuffer_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] ctxt.hoistTransientBuffer(lastDimBuffer_name, lastDimBuffer_dim) operatorRepresentation['lastDimBuffer'] = lastDimBuffer_name diff --git a/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py index 0319fea00b..1552d4c677 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py @@ -26,9 +26,8 @@ def alignToContext(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [] - @staticmethod def computeTransientBuffersSize( - ctxt: NetworkContext, + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: im2col_dim = 2 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y']) @@ -37,7 +36,7 @@ def computeTransientBuffersSize( 
def hoistTransientBuffers(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - im2col_name, im2col_dim = PULP2DConvTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] + im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] ctxt.hoistTransientBuffer(im2col_name, im2col_dim) operatorRepresentation['ctxtBuffer'] = im2col_name @@ -62,6 +61,22 @@ def alignToContext(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [] + def computeTransientBuffersSize( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + # The depthwise pulp-nn kernel reuses one column-shaped im2col scratch + # per core. Per core it needs `dim_kernel_x * (dim_in_y + pad_top + + # pad_bot) + dim_kernel_x` bytes (the trailing `+ dim_kernel_x` is the + # safety zone for the last v4u write). + pad_top = operatorRepresentation['padding_y_top'] + pad_bot = operatorRepresentation['padding_y_bottom'] + per_core = (operatorRepresentation['dim_kernel_x'] * + (operatorRepresentation['dim_im_in_y'] + pad_top + pad_bot) + + operatorRepresentation['dim_kernel_x']) + im2col_dim = 8 * per_core + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] + class PULP1DConvTemplate(NodeTemplate): @@ -84,9 +99,8 @@ def alignToContext(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [] - @staticmethod def computeTransientBuffersSize( - ctxt: NetworkContext, + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: im2col_dim = 8 * 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_y'] im2col_name = operatorRepresentation['nodeName'] + "_buffer" @@ -94,7 +108,7 @@ def computeTransientBuffersSize( def hoistTransientBuffers(self, ctxt: NetworkContext, operatorRepresentation: 
OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - im2col_name, im2col_dim = PULP1DConvTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] + im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] ctxt.hoistTransientBuffer(im2col_name, im2col_dim) operatorRepresentation['ctxtBuffer'] = im2col_name operatorRepresentation['ctxtBufferSize'] = im2col_dim @@ -106,6 +120,20 @@ class PULP1DDWConvTemplate(PULP1DConvTemplate): def __init__(self, templateStr): super().__init__(templateStr) + def computeTransientBuffersSize( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + # The depthwise pulp-nn kernel reuses one column-shaped im2col scratch + # per core. Per core it needs `dim_kernel_y * (dim_in_y + pad_top + pad_bot) + dim_kernel_y` bytes (the trailing `+ dim_kernel_y` is the safety zone for the last v4u write). + pad_top = operatorRepresentation['padding_y_top'] + pad_bot = operatorRepresentation['padding_y_bottom'] + per_core = (operatorRepresentation['dim_kernel_y'] * + (operatorRepresentation['dim_im_in_y'] + pad_top + pad_bot) + + operatorRepresentation['dim_kernel_y']) + im2col_dim = 8 * per_core + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] + PULPConv2D_8_Template = PULP2DConvTemplate(""" // PULP NN CONV diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index bfa893db94..074528e829 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -14,9 +14,8 @@ class PULP2DFloatConvIm2ColTemplate(NodeTemplate): def __init__(self, templateStr): super().__init__(templateStr) - @staticmethod def computeTransientBuffersSize( - ctxt: NetworkContext, + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> 
List[Tuple[str, Union[int, IntVar]]]: # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. im2col_dim = (operatorRepresentation["weight_type"].typeWidth // @@ -29,8 +28,7 @@ def computeTransientBuffersSize( def hoistTransientBuffers(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - im2col_name, im2col_dim = PULP2DFloatConvIm2ColTemplate.computeTransientBuffersSize( - ctxt, operatorRepresentation)[0] + im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] ctxt.hoistTransientBuffer(im2col_name, im2col_dim) operatorRepresentation['ctxtBuffer'] = im2col_name @@ -43,9 +41,8 @@ class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate): def __init__(self, templateStr): super().__init__(templateStr) - @staticmethod def computeTransientBuffersSize( - ctxt: NetworkContext, + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. 
@@ -58,8 +55,7 @@ def computeTransientBuffersSize( def hoistTransientBuffers(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize( - ctxt, operatorRepresentation)[0] + im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] ctxt.hoistTransientBuffer(im2col_name, im2col_dim) # Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers diff --git a/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py index af3a93a185..be4a7bfea8 100644 --- a/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py @@ -11,9 +11,8 @@ class PULPiSoftmaxTemplate(NodeTemplate): - @staticmethod def computeTransientBuffersSize( - ctxt: NetworkContext, + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: lastDimBuffer_dim = 8 * 4 * operatorRepresentation['lastDimLength'] @@ -22,8 +21,7 @@ def computeTransientBuffersSize( def hoistTransientBuffers(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - lastDimBuffer_name, lastDimBuffer_dim = PULPiSoftmaxTemplate.computeTransientBuffersSize( - ctxt, operatorRepresentation)[0] + lastDimBuffer_name, lastDimBuffer_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0] ctxt.hoistTransientBuffer(lastDimBuffer_name, lastDimBuffer_dim) operatorRepresentation['lastDimBuffer'] = lastDimBuffer_name diff --git a/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/activations.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/activations.npz new file mode 100644 index 0000000000..dd0c847eee Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/activations.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/inputs.npz new file mode 100644 index 0000000000..0974db7323 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/network.onnx new file mode 100644 index 0000000000..b61b59fe8c Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/outputs.npz new file mode 100644 index 0000000000..cd3eb1e4af Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ_8x16x16/outputs.npz differ diff --git a/DeeployTest/test_gap9_tiled_config.py b/DeeployTest/test_gap9_tiled_config.py index 764d61f0ca..6e8c3a5221 100644 --- a/DeeployTest/test_gap9_tiled_config.py +++ b/DeeployTest/test_gap9_tiled_config.py @@ -13,7 +13,8 @@ "Kernels/Integer/MatMul/Regular": [64000, 32000, 16000], "Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 4000], "Kernels/Integer/Conv/StriddedPadded_2D_RQ": [600], - "Kernels/Integer/Conv/DW_2D_RQ": [2561], + "Kernels/Integer/Conv/DW_2D_RQ": [64000, 2561], + "Kernels/Integer/Conv/DW_2D_RQ_8x16x16": [64000], "Kernels/Integer/Softmax/Regular": [800, 500, 300], "Kernels/Integer/Concat": [32000, 16000, 8000], "Kernels/Integer/Hardswish/Regular": [750], @@ -39,6 +40,7 @@ "Kernels/Integer/MatMul/Regular": [64000, 32000, 16000], "Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 5000], "Kernels/Integer/Conv/DW_2D_RQ": [5121], + "Kernels/Integer/Conv/DW_2D_RQ_8x16x16": [64000], "Kernels/Integer/Softmax/Regular": [1600, 1000, 600], "Kernels/Integer/Concat": [64000, 
32000, 16000], "Kernels/Integer/Hardswish/Regular": [750], @@ -64,6 +66,7 @@ "Models/MLPerf/KeywordSpotting": [64000], "Models/MLPerf/ImageClassification": [64000], "Models/MLPerf/AnomalyDetection": [64000], + "Models/MLPerf/VisualWakeWords": [64000], } L2_DOUBLEBUFFER_MODELS = { @@ -72,6 +75,7 @@ "Models/MLPerf/KeywordSpotting": [64000], "Models/MLPerf/ImageClassification": [64000], "Models/MLPerf/AnomalyDetection": [64000], + "Models/MLPerf/VisualWakeWords": [64000], } L3_SINGLEBUFFER_MODELS = { diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a687d9a489..6c5cdd66fd 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -112,6 +112,7 @@ "Models/MLPerf/KeywordSpotting": [64000], "Models/MLPerf/ImageClassification": [64000], "Models/MLPerf/AnomalyDetection": [64000], + "Models/MLPerf/VisualWakeWords": [64000], "Models/CCT/FP32/CCT_1_16_16_8": [64000], "Models/TinyViT/Demo": [4000], } @@ -127,6 +128,7 @@ "Models/MLPerf/KeywordSpotting": [128000], "Models/MLPerf/ImageClassification": [128000], "Models/MLPerf/AnomalyDetection": [128000], + "Models/MLPerf/VisualWakeWords": [128000], "Models/CCT/FP32/CCT_1_16_16_8": [128000], "Models/TinyViT/Demo": [8000], } diff --git a/TargetLibraries/GAP9/CMakeLists.txt b/TargetLibraries/GAP9/CMakeLists.txt index ca4c3ffbeb..4c25160540 100644 --- a/TargetLibraries/GAP9/CMakeLists.txt +++ b/TargetLibraries/GAP9/CMakeLists.txt @@ -23,6 +23,8 @@ target_compile_options(deeploygap9 PUBLIC ) target_compile_options(deeploygap9 PRIVATE + -Wno-incompatible-pointer-types + -Wno-implicit-function-declaration -Wno-sign-conversion -Wno-sign-compare -Wno-type-limits diff --git a/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a b/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a index d63369b6c9..07aef46d1e 100644 Binary files a/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a and b/TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a differ diff 
--git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 1a510c945b..78698bbfea 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -62,7 +62,8 @@ if (platform IN_LIST PULP_NNX_PLATFORMS) -Wno-sign-conversion -Wno-typedef-redefinition -Wno-unused-parameter - -Wno-incompatible-pointer-types-discards-qualifiers + -Wno-incompatible-pointer-types + -Wno-discarded-qualifiers ) target_link_libraries(deeploypulp INTERFACE pulp-nnx) endif() diff --git a/TargetLibraries/third_party/pulp-nn-mixed b/TargetLibraries/third_party/pulp-nn-mixed index faed38c72b..1d8eeee08a 160000 --- a/TargetLibraries/third_party/pulp-nn-mixed +++ b/TargetLibraries/third_party/pulp-nn-mixed @@ -1 +1 @@ -Subproject commit faed38c72b029b69dcab98571d228a66c3263891 +Subproject commit 1d8eeee08a2c32ec3850a0f2a1daa6551478599d