File size: 7,847 Bytes
ac5c83c a8f2672 ac5c83c a8f2672 ac5c83c 1528081 ac5c83c a8f2672 ac5c83c a8f2672 ac5c83c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
using namespace std;
using namespace cv;
using namespace dnn;
// Lookup table from the integer passed via the "backend" command-line option
// to the (backend, target) combination used by the network. The position of an
// entry is the value the user types, so the order must match the help text.
vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
    {dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU},        // 0
    {dnn::DNN_BACKEND_CUDA,   dnn::DNN_TARGET_CUDA},       // 1
    {dnn::DNN_BACKEND_CUDA,   dnn::DNN_TARGET_CUDA_FP16},  // 2
    {dnn::DNN_BACKEND_TIMVX,  dnn::DNN_TARGET_NPU},        // 3
    {dnn::DNN_BACKEND_CANN,   dnn::DNN_TARGET_NPU}         // 4
};
// Option table consumed by cv::CommandLineParser in main().
// Each entry has the form "{ name alias | default | help text }".
// The help text of "model" names the same file as its default value.
std::string keys =
    "{ help h | | Print help message. }"
    "{ model m | text_detection_cn_ppocrv3_2023may.onnx | Usage: Set model type, defaults to text_detection_cn_ppocrv3_2023may.onnx }"
    "{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be multiple by 32.}"
    "{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be multiple by 32.}"
    "{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
    "{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
    "{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
    "{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
    "{ save s | true | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
    "{ viz v | true | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
    "{ backend bt | 0 | Choose one of computation backends: "
    "0: (default) OpenCV implementation + CPU, "
    "1: CUDA + GPU (CUDA), "
    "2: CUDA + GPU (CUDA FP16), "
    "3: TIM-VX + NPU, "
    "4: CANN + NPU}";
/// Wrapper around cv::dnn::TextDetectionModel_DB that loads a network from
/// disk, applies the detection parameters once in the constructor and exposes
/// a single infer() call.
class PPOCRDet {
public:
    /// @param modPath    Path of the model file passed to readNet().
    /// @param inSize     Network input size; infer() requires images of exactly this size.
    /// @param binThresh  Value forwarded to setBinaryThreshold().
    /// @param polyThresh Value forwarded to setPolygonThreshold().
    /// @param maxCand    Value forwarded to setMaxCandidates().
    /// @param unRatio    Value forwarded to setUnclipRatio().
    /// @param bId        Preferred DNN backend.
    /// @param tId        Preferred DNN target.
    PPOCRDet(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
             float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
             dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU)
        : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
          polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
          backendId(bId), targetId(tId)
    {
        this->model = TextDetectionModel_DB(readNet(modelPath));

        this->model.setPreferableBackend(backendId);
        this->model.setPreferableTarget(targetId);

        this->model.setBinaryThreshold(binaryThreshold);
        this->model.setPolygonThreshold(polygonThreshold);
        this->model.setUnclipRatio(unclipRatio);
        this->model.setMaxCandidates(maxCandidates);

        // Arguments are (scale, size, mean): pixels are scaled by 1/255 and the
        // network is fed frames of inputSize.
        this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
    }

    /// Runs text detection on an image that already has the network input size.
    /// @param image Input frame; its rows/cols must equal inputSize (asserted).
    /// @return Pair of (detected polygons, per-detection confidences) as filled
    ///         in by TextDetectionModel_DB::detect().
    pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
        CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size ");
        CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size ");

        // Detect straight into the returned pair so the result vectors are not
        // deep-copied on the way out.
        pair< vector<vector<Point>>, vector<float> > results;
        this->model.detect(image, results.first, results.second);
        return results;
    }

private:
    string modelPath;               // path the network was loaded from
    TextDetectionModel_DB model;    // configured detection model
    Size inputSize;                 // required size of images given to infer()
    float binaryThreshold;
    float polygonThreshold;
    int maxCandidates;
    double unclipRatio;
    dnn::Backend backendId;
    dnn::Target targetId;
};
// Draws the detection polygons (results.first) on a copy of `image` and returns
// that copy; the input image is left unmodified. When `fps` is positive an
// "FPS: x.xx" label is written near the top-left corner using `textColor`.
Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
{
    Mat canvas = image.clone();

    const bool showFps = fps > 0;
    if (showFps)
    {
        const String label = format("FPS: %.2f", fps);
        putText(canvas, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
    }

    // Outline every detected text region.
    polylines(canvas, results.first, isClosed, boxColor, thickness);

    return canvas;
}
int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv, keys);
parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization in opencv Zoo using OpenCV.");
if (parser.has("help"))
{
parser.printMessage();
return 0;
}
int backendTargetid = parser.get<int>("backend");
String modelName = parser.get<String>("model");
if (modelName.empty())
{
CV_Error(Error::StsError, "Model file " + modelName + " not found");
}
Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
float binThresh = parser.get<float>("binary_threshold");
float polyThresh = parser.get<float>("polygon_threshold");
int maxCand = parser.get<int>("max_candidates");
double unRatio = parser.get<float>("unclip_ratio");
bool save = parser.get<bool>("save");
bool viz = parser.get<bool>("viz");
PPOCRDet model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
//! [Open a video file or an image file or a camera stream]
VideoCapture cap;
if (parser.has("input"))
cap.open(parser.get<String>("input"));
else
cap.open(0);
if (!cap.isOpened())
CV_Error(Error::StsError, "Cannot open video or file");
Mat originalImage;
static const std::string kWinName = modelName;
while (waitKey(1) < 0)
{
cap >> originalImage;
if (originalImage.empty())
{
if (parser.has("input"))
{
cout << "Frame is empty" << endl;
break;
}
else
continue;
}
int originalW = originalImage.cols;
int originalH = originalImage.rows;
double scaleHeight = originalH / double(inpSize.height);
double scaleWidth = originalW / double(inpSize.width);
Mat image;
resize(originalImage, image, inpSize);
// inference
TickMeter tm;
tm.start();
pair< vector<vector<Point>>, vector<float> > results = model.infer(image);
tm.stop();
auto x = results.first;
// Scale the results bounding box
for (auto &pts : results.first)
{
for (int i = 0; i < 4; i++)
{
pts[i].x = int(pts[i].x * scaleWidth);
pts[i].y = int(pts[i].y * scaleHeight);
}
}
originalImage = visualize(originalImage, results, tm.getFPS());
tm.reset();
if (parser.has("input"))
{
if (save)
{
cout << "Result image saved to result.jpg\n";
imwrite("result.jpg", originalImage);
}
if (viz)
{
imshow(kWinName, originalImage);
waitKey(0);
}
}
else
imshow(kWinName, originalImage);
}
return 0;
}
|