namespace kuiper_infer {

// Applies the element-wise sigmoid y = 1 / (1 + e^(-x)) to each tensor in the
// input batch, writing results into the matching output tensor.
//
// @param inputs  batch of input tensors; must be non-empty and contain no
//                null/empty tensors
// @param outputs batch of output tensors, one slot per input; a null or empty
//                slot is allocated here with the input's shape
// @return kInferSuccess, or a failure status for the first validation error
InferStatus SigmoidLayer::Forward(
    const std::vector<std::shared_ptr<Tensor<float>>> &inputs,
    std::vector<std::shared_ptr<Tensor<float>>> &outputs) {
  if (inputs.empty()) {
    LOG(ERROR) << "The input tensor array in the sigmoid layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }
  if (inputs.size() != outputs.size()) {
    LOG(ERROR) << "The input and output tensor array size of the sigmoid "
                  "layer do not match";
    return InferStatus::kInferFailedInputOutSizeMatchError;
  }

  // NOTE: was "constuint32_t" (missing space) — did not compile.
  const uint32_t batch_size = inputs.size();

  // Validation pass: reject null/empty inputs and shape mismatches before
  // touching any output, so a failure leaves no partial writes behind.
  for (uint32_t i = 0; i < batch_size; ++i) {
    const sftensor &input_data = inputs.at(i);
    const sftensor &output_data = outputs.at(i);
    if (input_data == nullptr || input_data->empty()) {
      LOG(ERROR)
          << "The input tensor array in the sigmoid layer has an empty tensor "
          << i << " th";
      return InferStatus::kInferFailedInputEmpty;
    }
    // A pre-allocated output must match the input's shape; null/empty slots
    // are allocated in the compute pass below.
    if (output_data != nullptr && !output_data->empty()) {
      if (input_data->shapes() != output_data->shapes()) {
        LOG(ERROR) << "The input and output tensor shapes of the sigmoid "
                      "layer do not match "
                   << i << " th";
        return InferStatus::kInferFailedInputOutSizeMatchError;
      }
    }
  }

  // Compute pass: allocate any missing outputs, then apply sigmoid per element.
  for (uint32_t i = 0; i < batch_size; ++i) {
    const std::shared_ptr<Tensor<float>> &input = inputs.at(i);
    // NOTE: the original condition was inverted (== nullptr ||), which would
    // pass for a null tensor and then dereference it. The validation pass
    // above already guarantees non-null, so this correction is behavior-safe.
    CHECK(input != nullptr && !input->empty())
        << "The input tensor array in the sigmoid layer has an empty tensor "
        << i << " th";

    std::shared_ptr<Tensor<float>> output = outputs.at(i);
    if (output == nullptr || output->empty()) {
      DLOG(ERROR)
          << "The output tensor array in the sigmoid layer has an empty tensor "
          << i << " th";
      output = std::make_shared<Tensor<float>>(input->shapes());
      outputs.at(i) = output;
    }
    CHECK(output->shapes() == input->shapes())
        << "The input and output tensor shapes of the sigmoid layer do not "
           "match "
        << i << " th";
    for (uint32_t j = 0; j < input->size(); ++j) {
      const float value = input->index(j);
      output->index(j) = 1.f / (1.f + expf(-value));
    }
  }
  return InferStatus::kInferSuccess;
}
// expression.cpp
//
// Evaluates the parsed arithmetic expression over a batch of input tensors
// using a stack of operand batches (reverse-Polish evaluation). NOTE: this
// function is truncated in this chunk — the add/mul execution and the final
// result handling continue past the visible lines.
//
// NOTE(review): the tokens "constauto" / "constuint32_t" / "constint32_t"
// below are missing a space ("const auto", etc.) and will not compile as-is;
// they look like a whitespace-mangled paste and should be fixed on next edit.
InferStatus ExpressionLayer::Forward(
    const std::vector<std::shared_ptr<Tensor<float>>>& inputs,
    std::vector<std::shared_ptr<Tensor<float>>>& outputs) {
  // Reject an empty input batch up front.
  if (inputs.empty()) {
    LOG(ERROR) << "The input tensor array in the expression layer is empty";
    return InferStatus::kInferFailedInputEmpty;
  }
  // Outputs must be pre-allocated by the caller (no allocation happens here).
  if (outputs.empty()) {
    LOG(ERROR) << "The output tensor array in the expression layer is empty";
    return InferStatus::kInferFailedOutputEmpty;
  }

  // The parser must exist and must have tokenized the statement successfully.
  CHECK(this->parser_ != nullptr)
      << "The parser in the expression layer is null!";
  this->parser_->Tokenizer(false);
  constauto& expressions = this->parser_->tokens();
  CHECK(!expressions.empty())
      << "The expression parser failed to parse " << statement_;

  // Validate every input tensor: none may be null or empty.
  for (uint32_t i = 0; i < inputs.size(); ++i) {
    const sftensor& input_data = inputs.at(i);
    if (input_data == nullptr || input_data->empty()) {
      LOG(ERROR) << "The input tensor array in the expression layer has an "
                    "empty tensor "
                 << i << "th";
      return InferStatus::kInferFailedInputEmpty;
    }
  }

  // Validate every output tensor and zero it before accumulation.
  constuint32_t batch_size = outputs.size();
  for (uint32_t i = 0; i < batch_size; ++i) {
    if (outputs.at(i) == nullptr || outputs.at(i)->empty()) {
      DLOG(ERROR) << "The output tensor array in the expression layer has an "
                     "empty tensor "
                  << i << "th";
      return InferStatus::kInferFailedOutputEmpty;
    }
    outputs.at(i)->Fill(0.f);
  }

  // Stack-based evaluation in reverse-Polish order: each stack entry holds
  // one operand batch of batch_size tensors.
  std::stack<std::vector<std::shared_ptr<Tensor<float>>>> op_stack;
  const std::vector<std::shared_ptr<TokenNode>>& token_nodes =
      this->parser_->Generate();
  for (constauto& token_node : token_nodes) {
    if (token_node->num_index >= 0) {
      // Operand token (@num_index): push the corresponding slice of the
      // input batch onto the stack.
      uint32_t start_pos = token_node->num_index * batch_size;
      std::vector<std::shared_ptr<Tensor<float>>> input_token_nodes;
      for (uint32_t i = 0; i < batch_size; ++i) {
        CHECK(i + start_pos < inputs.size())
            << "The " << i
            << "th operand doesn't have appropriate number of tensors";
        // FIXME: is this tensor copy necessary? (only the shared_ptr is
        // copied here, not the underlying tensor data)
        input_token_nodes.push_back(inputs.at(i + start_pos));
      }
      op_stack.push(input_token_nodes);
    } else {
      // Operation token: a negative num_index encodes the operator type.
      constint32_t op = token_node->num_index;
      if (op != int(TokenType::TokenAdd) && op != int(TokenType::TokenMul) &&
          op != int(TokenType::TokenSin)) {
        LOG(FATAL) << "Unknown operator type: " << op;
      }
      if (op == int(TokenType::TokenSin)) {
        // Unary sin: pop one operand batch, apply element-wise sin, push the
        // result batch back onto the stack.
        CHECK(op_stack.size() >= 1)
            << "The number of operand is less than one for sin operation";
        std::vector<std::shared_ptr<Tensor<float>>> input_node = op_stack.top();
        CHECK(input_node.size() == batch_size)
            << "The operand doesn't have appropriate number of tensors, "
               "which need "
            << batch_size;
        op_stack.pop();
        std::vector<std::shared_ptr<Tensor<float>>> output_token_nodes(
            batch_size);
        for (uint32_t i = 0; i < batch_size; ++i) {
          // do execution
          output_token_nodes.at(i) =
              TensorElementSin(input_node.at(i));  // Modified
        }
        op_stack.push(output_token_nodes);
        continue;  /// skip the rest of the loop body for the sin operation
      } else {
        // Binary add/mul: pop the two operand batches off the stack.
        CHECK(op_stack.size() >= 2) << "The number of operand is less than two";
        std::vector<std::shared_ptr<Tensor<float>>> input_node1 = op_stack.top();
        CHECK(input_node1.size() == batch_size)
            << "The first operand doesn't have appropriate number of tensors, "
               "which need "
            << batch_size;
        op_stack.pop();

        std::vector<std::shared_ptr<Tensor<float>>> input_node2 = op_stack.top();
        CHECK(input_node2.size() == batch_size)
            << "The second operand doesn't have appropriate number of tensors, "
               "which need "
            << batch_size;
        op_stack.pop();
Training LLMs involves instruction tuning, reinforcement learning, etc., which are difficult to replicate during QAT
Method:
Data-free quantization-aware training (QAT) which produces QAT data using next token data generation -> Select appropriate fine-tuning dataset
Per-channel weight quantization and per-token activation quantization (symmetric MinMax quantization), per-token quantization for KV cache -> Identify suitable quantizer
Cross-entropy based loss -> Knowledge distillation from full precision model
Result:
Empirical recommendations:
8-bit quantization should be preferred over smaller full precision models, and PTQ methods are sufficient for this case
4-bit models quantized using LLM-QAT should be preferred over 8-bit models of similar size -> 4-bit LLM-QAT models towards the best efficiency-accuracy tradeoff
Partial results:
Limitation:
4-bit quantization does not have hardware support out-of-the-box -> no hardware implementation
Method works well for 4-bit weights, 4-bit KV cache and 8-bit activations -> Insufficient for 4-bit activation quantization
Reduce memory footprint of parameter-efficient fine-tuning(PEFT) stage
Method:
Overall pipeline
QLoRA
4-bit NormalFloat Quantization -> better quantization data type for normally distributed data compared with 4-bit Integers and 4-bit Floats (See the paper for details)
Double Quantization -> combined with NF4 to reduce the memory footprint of quantization constants i.e. weights (See the paper for details)
Paged Optimizers -> manage memory spikes i.e. manage the memory swap between CPU and GPU
Result:
MMLU test accuracy
Memory footprint -> enables the finetuning of 33B parameter models on a single consumer GPU and 65B parameter models on a single professional GPU, even 7B parameter models on mobile phones(e.g. iPhone 12 Plus)
Limitation:
Can’t establish that QLoRA can match full 16-bit finetuning performance at 33B and 65B scales…
Did not evaluate different bit-precisions, e.g. 3-bit base models, or different adapter methods
BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment (CVPR 2022) -> BasicVSR++
Learning Trajectory-Aware Transformer for Video Super-Resolution (CVPR 2022) -> TTVSR
An Implicit Alignment for Video Super-Resolution (CVPR2023) -> IA-RT/IA-CNN
Rethinking Alignment in Video Super-Resolution Transformers (NIPS2022) -> PSRT
ResQ: Residual Quantization for Video Perception (ICCV 2023) -> ResQ
motivation: residuals exhibit a significantly lower variance than the frame activations, and can be quantized with lower error.
verified tasks: Human Pose Estimation/Semantic Segmentation
limitations:
requires the propagation of representations to future timesteps, leading to a memory overhead potentially impacting latency -> 对VSR任务影响小,例如BasicVSR++ 本身就是基于帧间传播的,且目前VSR对latency要求不高
implementing location-specific quantized operations is not trivial and requires specialized hardware or gather-scatter implementations of convolutions -> 实际部署困难问题 特定区域的量化选择 涉及稀疏处理的调度问题
ResQ is able to reduce the amortized cost of video processing, yet the peak BOPs is not reduced
basic: add_argument(name or flags, ...) -> 添加命令行参数(xxx: positional argument or --xxx: option that takes a value)。name or flags 参数可以是单个选项(例如 '-f'),也可以是多个选项(例如 '-f', '--file')。你可以使用许多其他关键字参数来配置参数的行为,如 type、default、help 等,示例如下
while read NAME: 这部分创建一个 while 循环,它将逐行读取管道传入的文件路径,并将每行内容赋值给 NAME 变量
do: 这标志着 while 循环的开始
mkdir -p "${NAME%.tar}": 这是在循环中的第一个命令。它使用 mkdir 命令创建目录,并且 -p no error if existing, make parent directories as needed. ${NAME%.tar} 是一种变量扩展,它会从 NAME 变量的值中删除 .tar扩展名,然后创建一个对应的目录
tar -xvf "${NAME}" -C "${NAME%.tar}": 这是在循环中的第二个命令。它使用 tar 命令来解压缩 NAME变量中指定的 .tar文件,并将解压后的文件放入对应的目录 ${NAME%.tar}, note: 参数说明 -C, --directory=DIR change to directory DIR 用于指定解压缩操作的目标目录
HR-WSI: Structure-Guided Ranking Loss for Single Image Depth Prediction
Holopix50k: A Large-Scale In-the-wild Stereo Image Dataset
DiverseDepth: Affine-invariant Depth Prediction Using Diverse Data
ReDWeb V1: Monocular Relative Depth Perception with Web Stereo Data Supervision
The Replica Dataset: A Digital Replica of Indoor Spaces
Taskonomy: Disentangling Task Transfer Learning
Methods
authority recommend
ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth (arXiv 2023.02)
Vision Transformers for Dense Prediction (ICCV 2021)
Learning to Recover 3D Scene Shape from a Single Image (CVPR 2021)
lightweight SIDE research
Deep Neighbor Layer Aggregation for Lightweight Self-Supervised Monocular Depth Estimation (arXiv 2023.09)
fully convolutional depth estimation network using contextual feature fusion
use high-resolution and low-resolution features to reserve information on small targets and fast-moving objects instead of long-range fusion
employing lightweight channel attention based on convolution in the decoder stage
RT-MonoDepth: Real-time Monocular Depth Estimation on Embedded Systems (arXiv 2023.08)
Fast inference based on convolution: RT-MonoDepth and RT-MonoDepthS run at 18.4/30.5 FPS on NVIDIA Jetson Nano and 253.0/364.1 FPS on NVIDIA Jetson AGX Orin on a single RGB image of resolution 640×192, and achieve relative state-of-the-art accuracy on the KITTI dataset.
Encoder (downsample inputs): 4-layer pyramid convolution encoder, removing the normalization layer, standard convolutions instead of depth-wise separable convolution.
Decoder (upsample and fuse): upsampling -> 3 × 3 depth-wise separable convolution followed by nearest-neighbor interpolation with a scale factor of 2; fusion -> mixed use of element-wise addition and concatenate; prediction -> convs + activating functions: leakyReLU, sigmoid.
Lightweight Monocular Depth Estimation via Token-Sharing Transformer (2023 IEEE International Conference on Robotics and Automation (ICRA), CCF-B)
Token-Sharing Transformer (TST): On the NYU Depth v2 dataset, TST can deliver depth maps up to 63.4 FPS in NVIDIA Jetson nano and 142.6 FPS in NVIDIA Jetson TX2.
Design concept: hierarchy-focused architecture (gradually reduces the resolutions of tokens) + bottleneck-focused architecture (bottleneck-focused architecture reduces the resolution through CNN and applies self-attention only in low-resolution tokens)
Lite-Mono: A Lightweight CNN and Transformer Architecture for Self-Supervised Monocular Depth Estimation (CVPR 2023)
efficient combination of CNNs and Transformers: Consecutive Dilated Convolutions (CDC) module -> shallow CNNs with dilated convolution to enhance local features; Local-Global Features Interaction (LGFI) module -> cross-covariance attention to compute the attention along the feature channels.
Boosting LightWeight Depth Estimation via Knowledge Distillation (International Conference on Knowledge Science, Engineering and Management, KSEM 2023, CCF-C)
lightweight network (MobileNet-v2 Encoder, Channel-wise attention) + Promoting KD with Auxiliary Data
Lightweight Monocular Depth Estimation with an Edge Guided Network (2022 17th International Conference on Control, Automation, Robotics and Vision, ICARCV, CORE Computer Science Conference Rankings: A)
Preliminary: edge information provides important cues for convolutional neural networks (CNNs) to estimate depth.
Encoder-Decoder Architecture:
Multi-scale Feature Extractor -> MobileNetV2 as the backbone
Edge Guidance Branch -> guiding depth estimation
Transformer-Based Feature Aggregation Module
Lightweight Monocular Depth Estimation through Guided Decoding (2022 International Conference on Robotics and Automation (ICRA), CCF-B)
lightweight encoder-decoder architecture for embedded platforms + Guided Upsampling Block
inference:
NYU Depth V2: 35.1 fps on the NVIDIA Jetson Nano and up to 144.5 fps on the NVIDIA Xavier NX
KITTI: 23.7 fps on the Jetson Nano and 102.9 fps on the Xavier NX
MobileXNet: An Efficient Convolutional Neural Network for Monocular Depth Estimation (IEEE Transactions on Intelligent Transportation Systems, 2022, CCF-B)
Video Super-Resolution Quantization (Time:2023.07.07-2023.08.07)
Paper Reading
Dynamic Network Quantization for Efficient Video Inference (ICCV2021)
Feat: selects optimal precision for each frame conditioned on the input for efficient video recognition
ResQ: Residual Quantization for Video Perception (ICCV2023)
Feat: the differences in network activations between two neighboring frames exhibit properties that make them highly quantizable
QuantSR: Accurate Low-bit Quantization for Efficient Image Super-Resolution (NIPS2023)
To overcome the representation homogeneity caused by quantization in the network, we introduce the Redistribution-driven Learnable Quantizer (RLQ). This is accomplished through an inference-agnostic efficient redistribution design, which adds additional information in both forward and backward passes to improve the representation ability of quantized networks. (为了克服网络中量化造成的表示同质性,我们引入了重分布驱动的可学习量化器 (RLQ)。这是通过与推理无关的高效重分布设计实现的,它在前向和后向传递中添加了额外信息,以提高量化网络的表示能力。)
Furthermore, to achieve flexible inference and break the upper limit of accuracy, we propose the Depth-dynamic Quantized Architecture (DQA). Our DQA allows for the trade-off between efficiency and accuracy during inference through weight sharing.(此外,为了实现灵活的推理并突破准确率的上限,我们提出了深度动态量化架构(DQA)。我们的DQA通过权重共享,实现了推理过程中效率和准确率之间的平衡。)
Knowledge Distillation for Optical Flow-Based Video Superresolution (JCSE2023)
Feat: Video super-resolution; Optical flow; Knowledge distillation;
EDVR: Video Restoration with Enhanced Deformable Convolutional Networks (NTIRE2019)
leverage temporal redundancies to accelerate video processing
Towards High Performance Video Object Detection for Mobiles (MSRA_arxiv2018)
Temporally Distributed Networks for Fast Video Semantic Segmentation (CVPR2020)