Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@

class iSoftmaxPreAllocatedBuffTemplate(NodeTemplate):

@staticmethod
def computeTransientBuffersSize(
ctxt: NetworkContext,
self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:

lastDimBuffer_dim = 8 * 4 * operatorRepresentation['lastDimLength']
Expand All @@ -22,8 +21,7 @@ def computeTransientBuffersSize(

def hoistTransientBuffers(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
lastDimBuffer_name, lastDimBuffer_dim = iSoftmaxPreAllocatedBuffTemplate.computeTransientBuffersSize(
ctxt, operatorRepresentation)[0]
lastDimBuffer_name, lastDimBuffer_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
ctxt.hoistTransientBuffer(lastDimBuffer_name, lastDimBuffer_dim)

operatorRepresentation['lastDimBuffer'] = lastDimBuffer_name
Expand Down
40 changes: 34 additions & 6 deletions Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@ def alignToContext(self, ctxt: NetworkContext,

return ctxt, operatorRepresentation, []

@staticmethod
def computeTransientBuffersSize(
ctxt: NetworkContext,
self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
im2col_dim = 2 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] *
operatorRepresentation['dim_kernel_y'])
Expand All @@ -37,7 +36,7 @@ def computeTransientBuffersSize(

def hoistTransientBuffers(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
im2col_name, im2col_dim = PULP2DConvTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
ctxt.hoistTransientBuffer(im2col_name, im2col_dim)

operatorRepresentation['ctxtBuffer'] = im2col_name
Expand All @@ -62,6 +61,22 @@ def alignToContext(self, ctxt: NetworkContext,

return ctxt, operatorRepresentation, []

def computeTransientBuffersSize(
        self, ctxt: NetworkContext,
        operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
    """Return the name and size of the transient im2col scratch buffer.

    The size is derived purely from ``operatorRepresentation``; ``ctxt`` is
    accepted for interface compatibility and is not read here.

    Sizing rule (as stated by the original author): the depthwise pulp-nn
    kernel reuses one column-shaped im2col scratch per core, which needs
    ``dim_kernel_x * (dim_im_in_y + padding_y_top + padding_y_bottom)``
    bytes plus a trailing ``dim_kernel_x`` bytes as a safety zone for the
    last v4u write.

    Returns:
        A single-element list ``[(buffer_name, buffer_size)]`` where
        ``buffer_name`` is ``nodeName + "_buffer"``.
    """
    rep = operatorRepresentation
    top_padding = rep['padding_y_top']
    bottom_padding = rep['padding_y_bottom']
    kernel_x = rep['dim_kernel_x']

    # Input height including the vertical padding rows.
    padded_height = rep['dim_im_in_y'] + top_padding + bottom_padding

    # One padded column per kernel column, plus the trailing safety zone.
    column_scratch = kernel_x * padded_height + kernel_x

    buffer_name = rep['nodeName'] + "_buffer"
    # The factor 8 is presumably the core count (one scratch per core) --
    # TODO confirm against the target configuration.
    return [(buffer_name, 8 * column_scratch)]


class PULP1DConvTemplate(NodeTemplate):

Expand All @@ -84,17 +99,16 @@ def alignToContext(self, ctxt: NetworkContext,

return ctxt, operatorRepresentation, []

@staticmethod
def computeTransientBuffersSize(
ctxt: NetworkContext,
self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
im2col_dim = 8 * 2 * operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_y']
im2col_name = operatorRepresentation['nodeName'] + "_buffer"
return [(im2col_name, im2col_dim)]

def hoistTransientBuffers(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
im2col_name, im2col_dim = PULP1DConvTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
ctxt.hoistTransientBuffer(im2col_name, im2col_dim)
operatorRepresentation['ctxtBuffer'] = im2col_name
operatorRepresentation['ctxtBufferSize'] = im2col_dim
Expand All @@ -106,6 +120,20 @@ class PULP1DDWConvTemplate(PULP1DConvTemplate):
def __init__(self, templateStr):
super().__init__(templateStr)

def computeTransientBuffersSize(
        self, ctxt: NetworkContext,
        operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
    """Return the name and size of the transient im2col scratch buffer.

    The size is derived purely from ``operatorRepresentation``; ``ctxt`` is
    accepted for interface compatibility and is not read here.

    Sizing rule (as stated by the original author): the depthwise pulp-nn
    kernel reuses one column-shaped im2col scratch per core, which needs
    ``dim_kernel_y * (dim_im_in_y + padding_y_top + padding_y_bottom)``
    bytes plus a trailing ``dim_kernel_y`` bytes as a safety zone for the
    last v4u write.

    Returns:
        A single-element list ``[(buffer_name, buffer_size)]`` where
        ``buffer_name`` is ``nodeName + "_buffer"``.
    """
    rep = operatorRepresentation
    top_padding = rep['padding_y_top']
    bottom_padding = rep['padding_y_bottom']
    kernel_y = rep['dim_kernel_y']

    # Input extent including the padding on both ends.
    padded_extent = rep['dim_im_in_y'] + top_padding + bottom_padding

    # One padded column per kernel tap, plus the trailing safety zone.
    column_scratch = kernel_y * padded_extent + kernel_y

    buffer_name = rep['nodeName'] + "_buffer"
    # The factor 8 is presumably the core count (one scratch per core) --
    # TODO confirm against the target configuration.
    return [(buffer_name, 8 * column_scratch)]


PULPConv2D_8_Template = PULP2DConvTemplate("""
// PULP NN CONV
Expand Down
12 changes: 4 additions & 8 deletions Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,8 @@ class PULP2DFloatConvIm2ColTemplate(NodeTemplate):
def __init__(self, templateStr):
super().__init__(templateStr)

@staticmethod
def computeTransientBuffersSize(
ctxt: NetworkContext,
self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
# Memory allocation for the im2col buffer can be dynamic, based on the number of cores.
im2col_dim = (operatorRepresentation["weight_type"].typeWidth //
Expand All @@ -29,8 +28,7 @@ def computeTransientBuffersSize(

def hoistTransientBuffers(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
im2col_name, im2col_dim = PULP2DFloatConvIm2ColTemplate.computeTransientBuffersSize(
ctxt, operatorRepresentation)[0]
im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
ctxt.hoistTransientBuffer(im2col_name, im2col_dim)

operatorRepresentation['ctxtBuffer'] = im2col_name
Expand All @@ -43,9 +41,8 @@ class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate):
def __init__(self, templateStr):
super().__init__(templateStr)

@staticmethod
def computeTransientBuffersSize(
ctxt: NetworkContext,
self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:

# Memory allocation for the im2col buffer can be dynamic, based on the number of cores.
Expand All @@ -58,8 +55,7 @@ def computeTransientBuffersSize(

def hoistTransientBuffers(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize(
ctxt, operatorRepresentation)[0]
im2col_name, im2col_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
ctxt.hoistTransientBuffer(im2col_name, im2col_dim)

# Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers
Expand Down
6 changes: 2 additions & 4 deletions Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@

class PULPiSoftmaxTemplate(NodeTemplate):

@staticmethod
def computeTransientBuffersSize(
ctxt: NetworkContext,
self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:

lastDimBuffer_dim = 8 * 4 * operatorRepresentation['lastDimLength']
Expand All @@ -22,8 +21,7 @@ def computeTransientBuffersSize(

def hoistTransientBuffers(self, ctxt: NetworkContext,
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
lastDimBuffer_name, lastDimBuffer_dim = PULPiSoftmaxTemplate.computeTransientBuffersSize(
ctxt, operatorRepresentation)[0]
lastDimBuffer_name, lastDimBuffer_dim = self.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
ctxt.hoistTransientBuffer(lastDimBuffer_name, lastDimBuffer_dim)

operatorRepresentation['lastDimBuffer'] = lastDimBuffer_name
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 5 additions & 1 deletion DeeployTest/test_gap9_tiled_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
"Kernels/Integer/MatMul/Regular": [64000, 32000, 16000],
"Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 4000],
"Kernels/Integer/Conv/StriddedPadded_2D_RQ": [600],
"Kernels/Integer/Conv/DW_2D_RQ": [2561],
"Kernels/Integer/Conv/DW_2D_RQ": [64000, 2561],
"Kernels/Integer/Conv/DW_2D_RQ_8x16x16": [64000],
"Kernels/Integer/Softmax/Regular": [800, 500, 300],
"Kernels/Integer/Concat": [32000, 16000, 8000],
"Kernels/Integer/Hardswish/Regular": [750],
Expand All @@ -39,6 +40,7 @@
"Kernels/Integer/MatMul/Regular": [64000, 32000, 16000],
"Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 5000],
"Kernels/Integer/Conv/DW_2D_RQ": [5121],
"Kernels/Integer/Conv/DW_2D_RQ_8x16x16": [64000],
"Kernels/Integer/Softmax/Regular": [1600, 1000, 600],
"Kernels/Integer/Concat": [64000, 32000, 16000],
"Kernels/Integer/Hardswish/Regular": [750],
Expand All @@ -64,6 +66,7 @@
"Models/MLPerf/KeywordSpotting": [64000],
"Models/MLPerf/ImageClassification": [64000],
"Models/MLPerf/AnomalyDetection": [64000],
"Models/MLPerf/VisualWakeWords": [64000],
}

L2_DOUBLEBUFFER_MODELS = {
Expand All @@ -72,6 +75,7 @@
"Models/MLPerf/KeywordSpotting": [64000],
"Models/MLPerf/ImageClassification": [64000],
"Models/MLPerf/AnomalyDetection": [64000],
"Models/MLPerf/VisualWakeWords": [64000],
}

L3_SINGLEBUFFER_MODELS = {
Expand Down
2 changes: 2 additions & 0 deletions DeeployTest/test_siracusa_tiled_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
"Models/MLPerf/KeywordSpotting": [64000],
"Models/MLPerf/ImageClassification": [64000],
"Models/MLPerf/AnomalyDetection": [64000],
"Models/MLPerf/VisualWakeWords": [64000],
"Models/CCT/FP32/CCT_1_16_16_8": [64000],
"Models/TinyViT/Demo": [4000],
}
Expand All @@ -127,6 +128,7 @@
"Models/MLPerf/KeywordSpotting": [128000],
"Models/MLPerf/ImageClassification": [128000],
"Models/MLPerf/AnomalyDetection": [128000],
"Models/MLPerf/VisualWakeWords": [128000],
"Models/CCT/FP32/CCT_1_16_16_8": [128000],
"Models/TinyViT/Demo": [8000],
}
Expand Down
2 changes: 2 additions & 0 deletions TargetLibraries/GAP9/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ target_compile_options(deeploygap9 PUBLIC
)

target_compile_options(deeploygap9 PRIVATE
-Wno-incompatible-pointer-types
-Wno-implicit-function-declaration
-Wno-sign-conversion
-Wno-sign-compare
-Wno-type-limits
Expand Down
Binary file modified TargetLibraries/GAP9/prebuilt/libpulp-nn-mixed.a
Binary file not shown.
3 changes: 2 additions & 1 deletion TargetLibraries/PULPOpen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ if (platform IN_LIST PULP_NNX_PLATFORMS)
-Wno-sign-conversion
-Wno-typedef-redefinition
-Wno-unused-parameter
-Wno-incompatible-pointer-types-discards-qualifiers
-Wno-incompatible-pointer-types
-Wno-discards-qualifiers
)
target_link_libraries(deeploypulp INTERFACE pulp-nnx)
endif()
2 changes: 1 addition & 1 deletion TargetLibraries/third_party/pulp-nn-mixed
Submodule pulp-nn-mixed updated 74 files
+2 −1 .gitignore
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i2_i8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i2_i8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i2_i8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i2_u8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i2_u8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i2_u8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i4_i8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i4_i8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i4_i8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i4_u8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i4_u8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i4_u8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i8_i8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i8_i8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i8_i8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i8_u8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i8_u8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_i8_u8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u2_i8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u2_i8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u2_i8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u2_u8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u2_u8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u2_u8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u4_i8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u4_i8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u4_i8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u4_u8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u4_u8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u4_u8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u8_i8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u8_i8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u8_i8_i8.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u8_u8_i2.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u8_u8_i4.c
+3 −3 XpulpV2/32bit/src/LinearQuant/pulp_nn_linear_u8_u8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i2_i8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i2_i8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i2_i8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i2_u8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i2_u8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i2_u8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i4_i8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i4_i8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i4_i8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i4_u8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i4_u8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i4_u8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i8_i8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i8_i8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i8_i8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i8_u8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i8_u8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_i8_u8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u2_i8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u2_i8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u2_i8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u2_u8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u2_u8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u2_u8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u4_i8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u4_i8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u4_i8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u4_u8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u4_u8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u4_u8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u8_i8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u8_i8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u8_i8_i8.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u8_u8_i2.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u8_u8_i4.c
+3 −3 XpulpV2/64bit/src/LinearQuant/pulp_nn_linear_u8_u8_i8.c
+3 −3 generators/templates/pulp_nn_linear_q_x_y_z.t
Loading