To get a better picture of how to make use of the artificial neural network provided by OpenCV, I decided to develop a simple OCR (optical character recognition) engine with CvANN_MLP.
At first, I downloaded the images from Stack Overflow. They are:

graph_00 (digits for training)
graph_01 (digits to classify)
Step 1 : segment the digits of graph_00
std::string const prefix("/Users/Qt/program/blogsCodes/");
cv::Mat img = cv::imread(prefix + "pic/digits00.png");
if(img.empty()){
    std::cerr<<"can't read image"<<std::endl;
    return -1;
}

detectCharacter dc;
std::vector<cv::Mat> training_data = dc.segment(img, {10, 10});
implementation of segment
std::vector<cv::Mat> detectCharacter::segment(cv::Mat const &input,
                                              cv::Size const &crop_size)
{
    CV_Assert(input.type() == CV_8UC3);

    //step 1 : binarize the image
    cv::Mat gray;
    cv::cvtColor(input, gray, CV_BGR2GRAY);
    cv::GaussianBlur(gray, gray, cv::Size(5, 5), 0);
    cv::Mat binary;
    cv::threshold(gray, binary, 0, 255,
                  cv::THRESH_BINARY_INV + cv::THRESH_OTSU);

    //step 2 : find the contours
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(binary.clone(), contours, CV_RETR_EXTERNAL,
                     CV_CHAIN_APPROX_SIMPLE);

    //step 3 : get the bounding rect of each contour
    std::vector<cv::Rect> min_rects;
    for(auto &data : contours){
        min_rects.emplace_back(cv::boundingRect(data));
    }

    //step 4 : crop each digit into the vector
    std::vector<cv::Mat> digits;
    for(size_t i = 0, size = contours.size(); i != size; ++i){
        cv::Mat const digit = crop_digit(binary, min_rects[i],
                                         contours[i], crop_size);
        digits.emplace_back(digit);
    }

    return digits;
}
implementation of crop_digit
/**
 * @brief crop a character from the input
 * @param input : input image
 * @param rect : location of the character
 * @param contour : contour of the character
 * @param size : size of the result
 * @return character after crop
 */
cv::Mat detectCharacter::crop_digit(cv::Mat const &input,
                                    cv::Rect const &rect,
                                    std::vector<cv::Point> const &contour,
                                    cv::Size const &size)
{
    cv::Mat mask = cv::Mat(input, rect);
    cv::Mat digit = cv::Mat::zeros(mask.size(), mask.type());
    //digit is freshly allocated, so it is continuous and a single
    //running pointer can walk over the whole matrix
    auto digit_ptr = digit.ptr<uchar>(0);
    for(int row = 0; row != mask.rows; ++row){
        //mask is a view into input, so its rows may not be
        //continuous and the pointer must be reset per row
        auto mask_ptr = mask.ptr<uchar>(row);
        for(int col = 0; col != mask.cols; ++col){
            //only take the pixels equal to 255 which
            //lie in the region surrounded by the contour
            if(*mask_ptr == 255 &&
               cv::pointPolygonTest(contour,
                                    cv::Point2f(col + rect.x, row + rect.y),
                                    false) >= 0){
                *digit_ptr = 255;
            }
            ++mask_ptr; ++digit_ptr;
        }
    }
    cv::resize(digit, digit, size);

    return digit;
}
Before returning the digit, I resize it to a specific size, because all samples fed to the ANN must have the same size and the same type.
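To make "same size and same type" concrete, here is a minimal sketch (my own illustration, not part of the project) of how one cropped 10x10 digit becomes a single training row:

//hypothetical helper : flatten a cropped digit into the
//1 row, 1 channel, CV_32F sample the ANN expects
cv::Mat to_training_row(cv::Mat const &digit)
{
    //a 10x10 digit becomes a 1x100 row vector
    cv::Mat row = digit.reshape(1, 1);
    row.convertTo(row, CV_32F);
    return row;
}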
Step 2 : set up training labels
This step is quite tedious: I have to map each digit produced by segmentation to its correct value. Since there are 125 digits after segmentation, it took me some time to get it done.
std::vector<int> const training_labels
{
7, 4, 8, 2, 1, 6, 4, 4, 3, 3, 9, 5, 6, 6, 5,
7, 9, 0, 1, 8, 8, 2, 4, 4, 6, 9, 1, 8, 3, 0,
3, 9, 4, 5, 9, 8, 4, 9, 2, 2, 6, 4, 4, 6, 9,
5, 5, 5, 0, 1, 1, 2, 5, 8, 3, 9, 1, 0, 7, 2,
0, 1, 4, 8, 2, 0, 5, 4, 7, 1, 1, 1, 8, 4, 8,
2, 1, 8, 0, 4, 9, 5, 3, 5, 2, 7, 1, 3, 2, 2,
8, 5, 0, 5, 5, 9, 0, 6, 4, 4, 8, 3, 9, 0, 7,
4, 6, 6, 0, 3, 2, 8, 2, 3, 1, 5, 6, 8, 0, 8,
4, 1, 2, 8, 9
};
Step 3 : write the training data and training labels into xml file
We could transform the data and labels into a suitable form without writing them to XML; there is no obligation to serialize them, so whether you write the data out or not depends on your needs.
void write_digits_xml(std::vector<cv::Mat> &training_data,
                      std::vector<int> const &training_labels)
{
    cv::Mat train_data;
    cv::Mat train_labels;
    for(auto &data : training_data){
        //same as data = data.reshape(1, 1);
        //data.convertTo(data, CV_32F);
        OCV::transform_to_svm_training_data(data);
        train_data.push_back(data);
    }
    cv::Mat(training_labels).copyTo(train_labels);

    cv::FileStorage fs("digits.xml", cv::FileStorage::WRITE);
    fs << "TrainingData10x10" << train_data;
    fs << "Labels" << train_labels;
}
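Reading the data back later is symmetric; a minimal sketch, assuming the same key names as above (this is presumably what the characterRecognizer constructor does, though its implementation is not shown in this post):

//sketch : load the training data and labels from digits.xml
cv::Mat train_data, train_labels;
cv::FileStorage fs("digits.xml", cv::FileStorage::READ);
fs["TrainingData10x10"] >> train_data;
fs["Labels"] >> train_labels;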
Step 4 : segment the digits of graph_01 (target)
cv::Mat target = cv::imread(prefix + "pic/digitTarget00.png");
if(target.empty()){
    std::cerr<<"can't read target"<<std::endl;
    return -1;
}

std::vector<cv::Mat> target_digits = dc.segment(target, {10, 10});
std::vector<cv::Mat> temp_target_digits; //for verification
for(auto &data : target_digits){
    temp_target_digits.emplace_back(data.clone());
    OCV::transform_to_svm_training_data(data);
}
At step 4, I segment the digits from graph_01 and create one more buffer, temp_target_digits, to store the digits right after segmentation. The matrices in target_digits have to be transformed into one-row, one-channel matrices, so we need a buffer that retains the segmented digits for verification in the next step.
Step 5 : train and classify the digits
characterRecognizer cr(prefix +
    "simpleOCR/build-exercise00-Qt5_1_1_clang_3_2-Release/digits.xml");
cr.train(10, 10);
for(size_t i = 0, size = target_digits.size(); i != size; ++i){
    int const classify = cr.classify(target_digits[i]);
    std::cout<<classify<<std::endl;
    cv::imshow("", temp_target_digits[i]);
    cv::waitKey();
}
At step 5, I train the neural network and classify the segmented images (the image we want to classify has to be transformed into a one-row, one-channel CV_32F matrix), then verify whether the result is correct by comparing each segmented digit with its classification.
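The implementation of classify is not listed in this post; a minimal sketch, assuming it simply feeds the sample through the network and picks the output neuron with the strongest response:

//sketch : classify one sample (1 row, 1 channel, CV_32F);
//the predicted digit is the index of the strongest output neuron
int characterRecognizer::classify(cv::Mat const &sample)
{
    cv::Mat response(1, num_character, CV_32F);
    ann.predict(sample, response);
    cv::Point max_loc;
    cv::minMaxLoc(response, nullptr, nullptr, nullptr, &max_loc);

    return max_loc.x;
}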
implementation of train
/**
 * @brief overload of train, you can specify the training
 * data and training labels with this overload
 * @param train_data : the training data
 * @param labels : the training labels
 * @param nlayers : number of neurons in the hidden layer
 * @param num_char : number of character classes, equal to the
 * number of distinct training labels (training classes)
 */
void characterRecognizer::train(cv::Mat const &train_data,
                                cv::Mat const &labels, int nlayers,
                                int num_char)
{
    CV_Assert(!train_data.empty() && !labels.empty());

    num_character = num_char;
    int buffer[] = {train_data.cols, nlayers, num_char};
    cv::Mat const layers(1, 3, CV_32S, buffer);
    ann.create(layers, CvANN_MLP::SIGMOID_SYM, 1, 1);

    //prepare train_classes : a matrix with one row per training
    //sample and one column per class, 1 marks the correct class
    cv::Mat train_classes;
    train_classes.create(train_data.rows, num_char, CV_32F);
    for(int i = 0; i != train_classes.rows; ++i){
        int const label = *labels.ptr<int>(i);
        auto train_ptr = train_classes.ptr<float>(i);
        for(int k = 0; k != train_classes.cols; ++k){
            *train_ptr = k != label ? 0.f : 1.f;
            ++train_ptr;
        }
    }

    cv::Mat const weights = cv::Mat::ones(1, train_data.rows, CV_32FC1);

    //learn the classifier
    ann.train(train_data, train_classes, weights);
}
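The two-argument overload called as cr.train(10, 10) in step 5 is not shown either; presumably it just reuses the data loaded from digits.xml by the constructor and forwards to the overload above, roughly like this (a sketch, assuming train_data and labels are members filled by the constructor):

//sketch : the overload used in step 5
void characterRecognizer::train(int nlayers, int num_char)
{
    //train_data and labels are assumed to be members
    //filled by the constructor from digits.xml
    train(train_data, labels, nlayers, num_char);
}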
The principle of this method can be found at this link. In this example I treat the whole region of the segmented digit as the feature vector; strictly speaking this is not an optimal solution, and there are many ways to extract better features from a segmented digit.
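As one example of an alternative, here is a hedged sketch of projection-histogram features (counts of foreground pixels per row and per column of the binary digit), a classic lightweight descriptor for printed characters; this is my illustration, not code from the project:

//sketch : projection histogram features of a binary digit,
//an alternative to feeding the raw pixels into the network
cv::Mat projection_features(cv::Mat const &digit)
{
    cv::Mat features(1, digit.rows + digit.cols, CV_32F);
    auto ptr = features.ptr<float>(0);
    //horizontal projection : foreground pixels per row
    for(int r = 0; r != digit.rows; ++r){
        *ptr++ = static_cast<float>(cv::countNonZero(digit.row(r)));
    }
    //vertical projection : foreground pixels per column
    for(int c = 0; c != digit.cols; ++c){
        *ptr++ = static_cast<float>(cv::countNonZero(digit.col(c)));
    }
    return features;
}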
Although the hit rate is high, from the viewpoint of machine learning this method is far from perfect, because it may overfit the training set. There are many techniques that can help us verify whether a model is underfitting or overfitting, and give us hints about how to obtain a better training result.
Don't jump to the conclusion that adding more training examples will improve the result; collecting those examples takes a lot of time. Use some techniques (learning curves, bias/variance analysis, model selection and so on) to determine how you should improve the result before you decide to collect more training examples. This is the same as with performance issues in programming: don't guess, measure.
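A learning curve, for example, just trains on growing subsets of the data and compares the error on the training subset with the error on a held-out validation set; a rough sketch of the idea (error_rate, validation_data and validation_labels are hypothetical, not part of the project):

//sketch : learning curve; if training error stays low while
//validation error stays high, the model is overfitting
for(int n = 25; n <= train_data.rows; n += 25){
    cr.train(train_data.rowRange(0, n),
             train_labels.rowRange(0, n), 10, 10);
    std::cout<<n<<", "
             <<error_rate(cr, train_data.rowRange(0, n),
                          train_labels.rowRange(0, n))<<", "
             <<error_rate(cr, validation_data, validation_labels)
             <<std::endl;
}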
The source code can be downloaded from github.