1. It is open source and royalty free.
2. It has decent support for both CPU and GPU.
3. It scales efficiently to multiple GPUs and machines.
4. It supports a C++ API, which means you do not need to ask your users to install a Python environment or ship your source code in order to run your apps.
5. MXNet supports many platforms, including Windows, Linux, macOS, AWS, Android, and iOS.
6. It comes with a lot of pre-trained models.
7. MMdnn supports MXNet, which means we can convert models trained with other libraries to MXNet (although not all models can be converted).
Step 1: Download the model and convert it to a format the C++ package can load
1. Install Anaconda (the version that comes with Python 3).
2. Install mxnet from the Anaconda terminal.
3. Install gluoncv from the Anaconda terminal.
4. Download the model and convert it with the following script:
import gluoncv as gcv
from gluoncv.utils import export_block

net = gcv.model_zoo.get_model('yolo3_darknet53_coco', pretrained=True)
export_block('yolo3_darknet53_coco', net)
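A note on the output: as far as I know, export_block writes two files into the working directory, yolo3_darknet53_coco-symbol.json (the structure of the network) and yolo3_darknet53_coco-0000.params (the weights); the exact epoch suffix may differ. These are the files we load in Step 2, so feel free to rename them to something shorter.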
Step 2: Load the converted model
void load_check_point(std::string const &model_params,
                      std::string const &model_symbol,
                      Symbol *symbol,
                      std::map<std::string, NDArray> *arg_params,
                      std::map<std::string, NDArray> *aux_params,
                      Context const &ctx)
{
    //The json file stores the structure of the network, the params file the weights
    Symbol new_symbol = Symbol::Load(model_symbol);
    std::map<std::string, NDArray> params = NDArray::LoadToMap(model_params);
    std::map<std::string, NDArray> args;
    std::map<std::string, NDArray> auxs;
    //Split the saved parameters into arguments and auxiliary states,
    //copying each NDArray onto the target device
    for (auto const &iter : params) {
        std::string const type = iter.first.substr(0, 4);
        std::string const name = iter.first.substr(4);
        if (type == "arg:")
            args[name] = iter.second.Copy(ctx);
        else if (type == "aux:")
            auxs[name] = iter.second.Copy(ctx);
    }
    *symbol = new_symbol;
    *arg_params = args;
    *aux_params = auxs;
}
You can use the load_check_point function as follows:
Symbol net;
std::map<std::string, NDArray> args, auxs;
load_check_point(model_params, model_symbols, &net, &args, &auxs, context);

//The shape of the input data must stay the same; if you need a different size,
//you can rebind the Executor or create a pool of Executors.
//In order to create the input layer of the Executor, I make a dummy NDArray.
//The value of "data" can be changed later.
args["data"] = NDArray(Shape(1, static_cast<unsigned>(input_size.height),
                             static_cast<unsigned>(input_size.width), 3), context);
executor_.reset(net.SimpleBind(context, args, std::map<std::string, NDArray>(),
                               std::map<std::string, OpReqType>(), auxs));
model_params is the location of the weights (e.g. yolo3_darknet53_coco.params), and model_symbols is the location of the symbols saved as JSON (e.g. yolo3_darknet53_coco.json).
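If you really do need to handle several input sizes, the rebinding mentioned in the comments could look something like the sketch below. This is only a minimal sketch: rebind_executor is a hypothetical helper (not from the repo), and it assumes the Executor is held in a std::unique_ptr as in the snippet above.

//A minimal sketch of rebinding: bind a fresh Executor whenever the
//"data" shape changes. args is taken by value on purpose so the caller's
//map is not modified.
void rebind_executor(Symbol const &net, Context const &context,
                     std::map<std::string, NDArray> args,
                     std::map<std::string, NDArray> const &auxs,
                     cv::Size const &new_size,
                     std::unique_ptr<Executor> &executor)
{
    //remember the yolo v3 limitation: width and height must be divisible by 32
    args["data"] = NDArray(Shape(1, static_cast<unsigned>(new_size.height),
                                 static_cast<unsigned>(new_size.width), 3), context);
    executor.reset(net.SimpleBind(context, args, std::map<std::string, NDArray>(),
                                  std::map<std::string, OpReqType>(), auxs));
}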
Step 3: Convert image format
Before we feed an image into the MXNet executor, we need to convert it.

NDArray cvmat_to_ndarray(cv::Mat const &bgr_image, Context const &ctx)
{
    cv::Mat rgb_image;
    cv::cvtColor(bgr_image, rgb_image, cv::COLOR_BGR2RGB);
    rgb_image.convertTo(rgb_image, CV_32FC3);
    //This api copies the data of rgb_image into the NDArray. As far as I know,
    //opencv guarantees cv::Mat is continuous unless it is a sub-matrix of a cv::Mat
    return NDArray(rgb_image.ptr<float>(),
                   Shape(1, static_cast<unsigned>(rgb_image.rows),
                         static_cast<unsigned>(rgb_image.cols), 3),
                   ctx);
}
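If you cannot rule out sub-matrices in your pipeline, a defensive copy before the conversion is cheap insurance. ensure_continuous below is a hypothetical helper, not part of the repo:

cv::Mat ensure_continuous(cv::Mat const &mat)
{
    //isContinuous() is false for a sub-matrix (ROI); clone() makes a compact
    //copy so the raw pointer read in cvmat_to_ndarray stays valid
    return mat.isContinuous() ? mat : mat.clone();
}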
Step 4: Perform object detection on video
void object_detector::forward(const cv::Mat &input)
{
    //By default, input_size_.height is 256 and input_size_.width is 320.
    //Yolo v3 has a limitation: the width and height of the image must be divisible by 32.
    if (input.rows != input_size_.height || input.cols != input_size_.width) {
        cv::resize(input, resize_img_, input_size_);
    } else {
        resize_img_ = input;
    }
    auto data = cvmat_to_ndarray(resize_img_, *context_);
    //Copy the image data into the "data" argument of the Executor
    data.CopyTo(&executor_->arg_dict()["data"]);
    //Forward is an async api
    executor_->Forward(false);
}
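Because Forward is asynchronous, the results are collected afterwards. A getter could look like the sketch below (get_outputs is a hypothetical member; the repo may organize this differently). The plot function in Step 5 instead waits on each copied array individually.

std::vector<mxnet::cpp::NDArray> object_detector::get_outputs() const
{
    //Block until the pending computation finishes, then hand the three
    //output arrays of yolo v3 (labels, scores, bounding boxes) to the caller
    mxnet::cpp::NDArray::WaitAll();
    return executor_->outputs;
}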
Step 5: Draw bounding boxes on the image
void plot_object_detector_bboxes::plot(cv::Mat &inout,
                                       std::vector<mxnet::cpp::NDArray> const &predict_results,
                                       bool normalize)
{
    using namespace mxnet::cpp;

    //1. predict_results comes from the output of the Executor (executor_->outputs)
    //2. Must set the Context to cpu because we need to process the data on the cpu later
    auto labels = predict_results[0].Copy(Context(kCPU, 0));
    auto scores = predict_results[1].Copy(Context(kCPU, 0));
    auto bboxes = predict_results[2].Copy(Context(kCPU, 0));
    //1. Should call wait because the Forward api of the Executor is async
    //2. scores and labels can be treated as one-dimensional arrays
    //3. bboxes can be treated as a two-dimensional array
    bboxes.WaitToRead();
    scores.WaitToRead();
    labels.WaitToRead();

    size_t const num = bboxes.GetShape()[1];
    for (size_t i = 0; i < num; ++i) {
        float const score = scores.At(0, 0, i);
        //the detections are sorted by score, so we can stop at the first
        //one below the threshold
        if (score < thresh_) break;

        size_t const cls_id = static_cast<size_t>(labels.At(0, 0, i));
        auto const color = colors_[cls_id];
        //pt1 : top left; pt2 : bottom right
        cv::Point pt1, pt2;
        //normalize_points performs the normalization
        std::tie(pt1, pt2) = normalize_points(bboxes.At(0, i, 0), bboxes.At(0, i, 1),
                                              bboxes.At(0, i, 2), bboxes.At(0, i, 3),
                                              normalize, cv::Size(inout.cols, inout.rows));
        cv::rectangle(inout, pt1, pt2, color, 2);

        std::string txt;
        if (labels_.size() > cls_id) {
            txt += labels_[cls_id];
        }
        std::stringstream ss;
        ss << std::fixed << std::setprecision(3) << score;
        txt += " " + ss.str();
        put_label(inout, txt, pt1, color);
    }
}
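put_label is not shown in this post; a plausible sketch with plain OpenCV calls (the real implementation in the repo may differ) is:

void put_label(cv::Mat &img, std::string const &text,
               cv::Point const &pt, cv::Scalar const &color)
{
    int const font = cv::FONT_HERSHEY_SIMPLEX;
    double const scale = 0.5;
    int baseline = 0;
    cv::Size const text_size = cv::getTextSize(text, font, scale, 1, &baseline);
    //draw a filled box behind the text so it stays readable on busy frames
    cv::Point const origin(pt.x, std::max(text_size.height, pt.y - 4));
    cv::rectangle(img, origin + cv::Point(0, baseline),
                  origin + cv::Point(text_size.width, -text_size.height),
                  color, cv::FILLED);
    cv::putText(img, text, origin, font, scale, cv::Scalar(255, 255, 255));
}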
I only covered the key points in this post; if you want to study the details, please check the full source code on GitHub.