diff --git a/app/calcmask.cc b/app/calcmask.cc index 308e3fb..e6ea988 100644 --- a/app/calcmask.cc +++ b/app/calcmask.cc @@ -51,10 +51,12 @@ CalcMask::~CalcMask() { bs_maskgen_delete(maskctx); } -void CalcMask::set_input_frame(cv::Mat &frame) { +void CalcMask::set_input_frame(const cv::Mat &frame, bool multipass) { std::lock_guard hold(lock_frame); *frame_next = frame.clone(); + this->multipass = multipass; + new_frame = true; condition_new_frame.notify_all(); } diff --git a/app/calcmask.h b/app/calcmask.h index fdc251e..12e5e85 100644 --- a/app/calcmask.h +++ b/app/calcmask.h @@ -16,6 +16,7 @@ enum class thread_state_t { RUNNING, DONE }; class CalcMask final { protected: volatile thread_state_t state; + volatile bool multipass; void *maskctx; timestamp_t t0; @@ -58,6 +59,6 @@ class CalcMask final { CalcMask(const std::string& modelname, size_t threads, size_t width, size_t height); ~CalcMask(); - void set_input_frame(cv::Mat &frame); + void set_input_frame(const cv::Mat &frame, bool multipass); void get_output_mask(cv::Mat &out); }; diff --git a/app/deepseg.cc b/app/deepseg.cc index d77e53c..9d38441 100644 --- a/app/deepseg.cc +++ b/app/deepseg.cc @@ -68,6 +68,7 @@ int main(int argc, char* argv[]) try { bool flipHorizontal = false; bool flipVertical = false; + bool multipass = false; std::string vcam = "/dev/video1"; std::string ccam = "/dev/video0"; @@ -89,6 +90,8 @@ int main(int argc, char* argv[]) try { flipHorizontal = !flipHorizontal; } else if (args[arg] == "-V") { flipVertical = !flipVertical; + } else if (args[arg] == "-M") { + multipass = !multipass; } else if (args[arg] == "-v") { if (hasArgument) { vcam = args[++arg]; @@ -181,7 +184,7 @@ int main(int argc, char* argv[]) try { fprintf(stderr, "\n"); fprintf(stderr, "usage:\n"); fprintf(stderr, " backscrub [-?] 
[-d] [-p] [-c ] [-v ] [-w ] [-h ]\n"); - fprintf(stderr, " [-t ] [-b ] [-m ] [-p ] [-H] [-V]\n"); + fprintf(stderr, " [-t ] [-b ] [-m ] [-p ] [-H] [-V] [-M]\n"); fprintf(stderr, "\n"); fprintf(stderr, "-? Display this usage information\n"); fprintf(stderr, "-d Increase debug level\n"); @@ -200,6 +203,7 @@ int main(int argc, char* argv[]) try { fprintf(stderr, "-p bgblur: Blur the video background\n"); fprintf(stderr, "-H Mirror the output horizontally\n"); fprintf(stderr, "-V Mirror the output vertically\n"); + fprintf(stderr, "-M Activate multi-pass filtering (for aspect ratio mismatch)\n"); exit(1); } @@ -221,6 +225,7 @@ int main(int argc, char* argv[]) try { printf("height: %zu\n", height); printf("flip_h: %s\n", flipHorizontal ? "yes" : "no"); printf("flip_v: %s\n", flipVertical ? "yes" : "no"); + printf("multi: %s\n", multipass ? "yes" : "no"); printf("threads:%zu\n", threads); printf("back: %s\n", s_backg ? s_backg.value().c_str() : "(none)"); printf("model: %s\n\n", s_model ? s_model.value().c_str() : "(none)"); @@ -290,7 +295,7 @@ int main(int argc, char* argv[]) try { // copy new frame to buffer cap.retrieve(raw); ti.retrns = timestamp(); - ai.set_input_frame(raw); + ai.set_input_frame(raw, multipass); ti.copyns = timestamp(); if (raw.rows == 0 || raw.cols == 0) @@ -410,6 +415,7 @@ int main(int argc, char* argv[]) try { " f: toggle FPS display on/off", " b: toggle background display on/off", " m: toggle mask display on/off", + " M: toggle multi-pass processing on/off", " ?: toggle this help text on/off" }; @@ -479,6 +485,10 @@ int main(int argc, char* argv[]) try { showMask = !showMask; break; + case 'M': + multipass = !multipass; + break; + case '?': showHelp = !showHelp; break; diff --git a/lib/libbackscrub.cc b/lib/libbackscrub.cc index 4d00722..f121a06 100644 --- a/lib/libbackscrub.cc +++ b/lib/libbackscrub.cc @@ -37,32 +37,64 @@ struct normalization_t { float offset; }; +struct backscrub_rect_t { + cv::Rect src; + cv::Rect dst; + + backscrub_rect_t() 
= delete; + backscrub_rect_t(const cv::Rect& _src, const cv::Rect& _dst) : src(_src), dst(_dst) {}; + backscrub_rect_t(const backscrub_rect_t& other) = default; +}; + +struct backscrub_point_t { + size_t x; + size_t y; + + backscrub_point_t() = delete; + backscrub_point_t(size_t _x, size_t _y) : x(_x), y(_y) {}; + backscrub_point_t(const backscrub_point_t& other) = default; +}; + struct backscrub_ctx_t { // Loaded inference model std::unique_ptr model; + // Model interpreter instance std::unique_ptr interpreter; + // Specific model type & input normalization modeltype_t modeltype; normalization_t norm; + // Optional callbacks with caller-provided context void (*ondebug)(void *ctx, const char *msg); void (*onprep)(void *ctx); void (*oninfer)(void *ctx); void (*onmask)(void *ctx); void *caller_ctx; - // Processing state - cv::Mat input; - cv::Mat output; - cv::Rect roidim; - cv::Mat mask; - cv::Mat mroi; - cv::Mat ofinal; - cv::Size blur; + + cv::Rect img_dim; // Image dimensions + + // Single step variables + cv::Mat input; // NN input tensors + cv::Mat output; // NN output tensors + cv::Mat ofinal; // NN output (post-processed mask) + + float src_ratio; // Source image aspect ratio + cv::Rect src_roidim; // Source image rect of interest + cv::Mat mask_region; // Region of the final mask to operate on + + float net_ratio; // NN input image aspect ratio + cv::Rect net_roidim; // NN input image rect of interest + + // Result stitching variables cv::Mat in_u8_bgr; - cv::Rect in_roidim; - float ratio; - float frameratio; + + cv::Size blur; // Size of blur on final mask + cv::Mat mask; // Fully processed mask (full image) + + // Information about the regions to process + std::vector region_rects; }; // Debug helper @@ -203,14 +235,17 @@ void *bs_maskgen_new( ) { // Allocate context backscrub_ctx_t *pctx = new backscrub_ctx_t; + // Take a reference so we can write tidy code with ctx. 
backscrub_ctx_t &ctx = *pctx; + // Save callbacks ctx.ondebug = ondebug; ctx.onprep = onprep; ctx.oninfer = oninfer; ctx.onmask = onmask; ctx.caller_ctx = caller_ctx; + // Load model ctx.model = tflite::FlatBufferModel::BuildFromFile(modelname.c_str()); @@ -222,7 +257,6 @@ void *bs_maskgen_new( // Determine model type and normalization values ctx.modeltype = get_modeltype(modelname); - ctx.norm = get_normalization(ctx.modeltype); if (modeltype_t::Unknown == ctx.modeltype) { _dbg(ctx, "error: unknown model type '%s'.\n", modelname.c_str()); @@ -230,10 +264,16 @@ void *bs_maskgen_new( return nullptr; } + ctx.norm = get_normalization(ctx.modeltype); + // Build the interpreter tflite::ops::builtin::BuiltinOpResolver resolver; + // custom op for Google Meet network - resolver.AddCustom("Convolution2DTransposeBias", mediapipe::tflite_operations::RegisterConvolution2DTransposeBias()); + resolver.AddCustom( + "Convolution2DTransposeBias", + mediapipe::tflite_operations::RegisterConvolution2DTransposeBias() + ); tflite::InterpreterBuilder builder(*ctx.model, resolver); builder(&ctx.interpreter); @@ -263,22 +303,78 @@ void *bs_maskgen_new( return nullptr; } - ctx.ratio = (float)ctx.input.rows / (float)ctx.input.cols; - ctx.frameratio = (float)height / (float)width; + ctx.img_dim = cv::Rect(0, 0, ctx.input.cols, ctx.input.rows); + + ctx.src_ratio = (float)height / (float)width; + ctx.net_ratio = (float)ctx.input.rows / (float)ctx.input.cols; + + const auto size_src = backscrub_point_t{width, height}; + const auto size_net = backscrub_point_t(ctx.input.cols, ctx.input.rows); + + auto size_filter = size_net; + + /** + * The following code assumes that the source image is larger + * than the input for the neuronal network. + * If src.x * net.y > src.y * net.x we know that the image has a wider aspect ratio then the network. + * If src.x * net.y < src.y * net.x we know that the network has the wider aspect ratio. 
+ * In each case we chose the largest rectangle within the source image that fits within the network. + * This rectangle is than applied multiple times by sliding it across the source image until all of the source is covered. + * When sliding the network window across the source it is ensured that we do an odd number of passes. + * This forces at least one window to cover the center region of the image. + */ + + auto wnd_count = backscrub_point_t{1, 1}; + + if (size_src.x * size_net.y > size_src.y * size_net.x) { + size_filter.x = size_net.x * size_src.y / size_net.y; + size_filter.y = size_src.y; + wnd_count.x = 1 | ((size_src.x + size_filter.x - 1) / size_filter.x); + wnd_count.y = 1; + } else { + size_filter.x = size_src.x; + size_filter.y = size_net.y * size_src.x / size_net.x; + wnd_count.x = 1; + wnd_count.y = 1 | ((size_src.y + size_filter.y - 1) / size_filter.y); + } // initialize mask and model-aspect ROI in center - if (ctx.frameratio < ctx.ratio) { + if (ctx.src_ratio < ctx.net_ratio) { // if frame is wider than model, then use only the frame center - ctx.roidim = cv::Rect((width - height / ctx.ratio) / 2, 0, height / ctx.ratio, height); - ctx.in_roidim = cv::Rect(0, 0, ctx.input.cols, ctx.input.rows); + ctx.src_roidim = cv::Rect((width - height / ctx.net_ratio) / 2, 0, height / ctx.net_ratio, height); + ctx.net_roidim = cv::Rect(0, 0, ctx.input.cols, ctx.input.rows); } else { // if model is wider than the frame, center the frame in the model - ctx.roidim = cv::Rect(0, 0, width, height); - ctx.in_roidim = cv::Rect((ctx.input.cols - ctx.input.rows / ctx.frameratio) / 2, 0, ctx.input.rows / ctx.frameratio, ctx.input.rows); + ctx.src_roidim = cv::Rect(0, 0, width, height); + ctx.net_roidim = cv::Rect((ctx.input.cols - ctx.input.rows / ctx.src_ratio) / 2, 0, ctx.input.rows / ctx.src_ratio, ctx.input.rows); } - ctx.mask = cv::Mat::ones(height, width, CV_8UC1) * 255; - ctx.mroi = ctx.mask(ctx.roidim); + // Item 0 is always a central cut from the image + 
ctx.region_rects.clear(); + ctx.region_rects.emplace_back(backscrub_rect_t( + ctx.src_roidim, ctx.net_roidim + )); + + for(size_t idx_x = 0; idx_x < wnd_count.x; idx_x++) { + for(size_t idx_y = 0; idx_y < wnd_count.y; idx_y++) { + const size_t sx = wnd_count.x > 1 ? wnd_count.x - 1 : 1; + const size_t sy = wnd_count.y > 1 ? wnd_count.y - 1 : 1; + + size_t dx = size_src.x - size_filter.x; + size_t dy = size_src.y - size_filter.y; + + dx *= idx_x; + dy *= idx_y; + + dx /= sx; + dy /= sy; + + auto src_rect = cv::Rect(dx, dy, size_filter.x, size_filter.y); + auto dst_rect = cv::Rect(0, 0, ctx.input.cols, ctx.input.rows); + + ctx.region_rects.emplace_back(src_rect, dst_rect); + } + } ctx.in_u8_bgr = cv::Mat(ctx.input.rows, ctx.input.cols, CV_8UC3, cv::Scalar(0, 0, 0)); @@ -296,12 +392,6 @@ void bs_maskgen_delete(void *context) { backscrub_ctx_t &ctx = *((backscrub_ctx_t *)context); - // clear all mask data - ctx.ofinal.deallocate(); - ctx.mask.deallocate(); - ctx.input.deallocate(); - ctx.output.deallocate(); - // drop interpreter (if present) if (ctx.interpreter != nullptr) ctx.interpreter.reset(); @@ -319,110 +409,121 @@ bool bs_maskgen_process(void *context, cv::Mat &frame, cv::Mat &mask) { backscrub_ctx_t &ctx = *((backscrub_ctx_t *)context); - // map ROI - cv::Mat roi = frame(ctx.roidim); + ctx.mask = cv::Mat::ones(frame.rows, frame.cols, CV_8UC1) * 255; - cv::Mat in_u8_rgb; - cv::Mat in_roi = ctx.in_u8_bgr(ctx.in_roidim); - cv::resize(roi, in_roi, ctx.in_roidim.size()); - cv::cvtColor(ctx.in_u8_bgr, in_u8_rgb, cv::COLOR_BGR2RGB); + for(auto& region: ctx.region_rects) { + ctx.src_roidim = region.src; + ctx.net_roidim = region.dst; - // TODO: can convert directly to float? 
+ ctx.mask_region = ctx.mask(ctx.src_roidim); - // bilateral filter to reduce noise - if (1) { - cv::Mat filtered; - cv::bilateralFilter(in_u8_rgb, filtered, 5, 100.0, 100.0); - in_u8_rgb = filtered; - } + // map ROI + cv::Mat roi = frame(ctx.src_roidim); - // convert to float and normalize values expected by the model - in_u8_rgb.convertTo(ctx.input, CV_32FC3, ctx.norm.scaling, ctx.norm.offset); + cv::Mat in_roi = ctx.in_u8_bgr(ctx.net_roidim); + cv::resize(roi, in_roi, ctx.net_roidim.size()); - if (ctx.onprep) - ctx.onprep(ctx.caller_ctx); + cv::Mat in_u8_rgb; + cv::cvtColor(ctx.in_u8_bgr, in_u8_rgb, cv::COLOR_BGR2RGB); - // Run inference - if (ctx.interpreter->Invoke() != kTfLiteOk) { - _dbg(ctx, "error: failed to interpret video frame\n"); - return false; - } + // TODO: can convert directly to float? - if (ctx.oninfer) - ctx.oninfer(ctx.caller_ctx); + // bilateral filter to reduce noise + if (1) { + cv::Mat filtered; + cv::bilateralFilter(in_u8_rgb, filtered, 5, 100.0, 100.0); + in_u8_rgb = filtered; + } - float* tmp = (float*)ctx.output.data; - uint8_t* out = (uint8_t*)ctx.ofinal.data; + // convert to float and normalize values expected by the model + in_u8_rgb.convertTo(ctx.input, CV_32FC3, ctx.norm.scaling, ctx.norm.offset); - switch (ctx.modeltype) { - case modeltype_t::DeepLab: - // find class with maximum probability - for (unsigned int n = 0; n < ctx.output.total(); n++) { - float maxval = -10000; - size_t maxpos = 0; - - for (size_t i = 0; i < cnum; i++) { - if (tmp[n * cnum + i] > maxval) { - maxval = tmp[n * cnum + i]; - maxpos = i; - } - } + if (ctx.onprep) + ctx.onprep(ctx.caller_ctx); - // set mask to 0 where class == person - uint8_t val = (maxpos == pers ? 
0 : 255); - out[n] = (val & 0xE0) | (out[n] >> 3); - } + // Run inference + if (ctx.interpreter->Invoke() != kTfLiteOk) { + _dbg(ctx, "error: failed to interpret video frame\n"); + return false; + } + + if (ctx.oninfer) + ctx.oninfer(ctx.caller_ctx); + + float* tmp = (float*)ctx.output.data; + uint8_t* out = (uint8_t*)ctx.ofinal.data; + + switch (ctx.modeltype) { + case modeltype_t::DeepLab: + // find class with maximum probability + for (unsigned int n = 0; n < ctx.output.total(); n++) { + float maxval = -10000; + size_t maxpos = 0; + + for (size_t i = 0; i < cnum; i++) { + if (tmp[n * cnum + i] > maxval) { + maxval = tmp[n * cnum + i]; + maxpos = i; + } + } - break; + // set mask to 0 where class == person + uint8_t val = (maxpos == pers ? 0 : 255); + out[n] = (val & 0xE0) | (out[n] >> 3); + } - case modeltype_t::BodyPix: - case modeltype_t::MLKitSelfie: + break; - // threshold probability - for (unsigned int n = 0; n < ctx.output.total(); n++) { - // FIXME: hardcoded threshold - uint8_t val = (tmp[n] > 0.65 ? 0 : 255); - out[n] = (val & 0xE0) | (out[n] >> 3); - } + case modeltype_t::BodyPix: + case modeltype_t::MLKitSelfie: - break; + // threshold probability + for (unsigned int n = 0; n < ctx.output.total(); n++) { + // FIXME: hardcoded threshold + uint8_t val = (tmp[n] > 0.65 ? 0 : 255); + out[n] = (val & 0xE0) | (out[n] >> 3); + } - case modeltype_t::GoogleMeetSegmentation: + break; + + case modeltype_t::GoogleMeetSegmentation: + + /* 256 x 144 x 2 tensor for the full model or 160 x 96 x 2 + * tensor for the light model with masks for background + * (channel 0) and person (channel 1) where values are in + * range [MIN_FLOAT, MAX_FLOAT] and user has to apply + * softmax across both channels to yield foreground + * probability in [0.0, 1.0]. 
+ */ + for (unsigned int n = 0; n < ctx.output.total(); n++) { + float exp0 = expf(tmp[2 * n ]); + float exp1 = expf(tmp[2 * n + 1]); + float p0 = exp0 / (exp0 + exp1); + float p1 = exp1 / (exp0 + exp1); + uint8_t val = (p0 < p1 ? 0 : 255); + out[n] = (val & 0xE0) | (out[n] >> 3); + } - /* 256 x 144 x 2 tensor for the full model or 160 x 96 x 2 - * tensor for the light model with masks for background - * (channel 0) and person (channel 1) where values are in - * range [MIN_FLOAT, MAX_FLOAT] and user has to apply - * softmax across both channels to yield foreground - * probability in [0.0, 1.0]. - */ - for (unsigned int n = 0; n < ctx.output.total(); n++) { - float exp0 = expf(tmp[2 * n ]); - float exp1 = expf(tmp[2 * n + 1]); - float p0 = exp0 / (exp0 + exp1); - float p1 = exp1 / (exp0 + exp1); - uint8_t val = (p0 < p1 ? 0 : 255); - out[n] = (val & 0xE0) | (out[n] >> 3); - } + break; - break; + case modeltype_t::Unknown: + _dbg(ctx, "error: unknown model type (%d)\n", ctx.modeltype); + return false; + } - case modeltype_t::Unknown: - _dbg(ctx, "error: unknown model type (%d)\n", ctx.modeltype); - return false; - } + if (ctx.onmask) + ctx.onmask(ctx.caller_ctx); - if (ctx.onmask) - ctx.onmask(ctx.caller_ctx); + // scale up into full-sized mask + cv::Mat tmpbuf; + cv::resize(ctx.ofinal(ctx.net_roidim), tmpbuf, ctx.mask_region.size()); - // scale up into full-sized mask - cv::Mat tmpbuf; - cv::resize(ctx.ofinal(ctx.in_roidim), tmpbuf, ctx.mroi.size()); + // blur at full size for maximum smoothness + cv::blur(tmpbuf, ctx.mask_region, ctx.blur); - // blur at full size for maximum smoothness - cv::blur(tmpbuf, ctx.mroi, ctx.blur); + // copy out + mask = ctx.mask; + } - // copy out - mask = ctx.mask; return true; }