This is a follow-up question to An Updated Multi-dimensional Image Data Structure with Variadic Template Functions in C++. To apply a pixel operation to an `Image` class, I implemented the `apply_each_pixel` and `apply_each_pixel_openmp` template functions to compare the performance of `std::execution::par` against OpenMP. After testing multiple times, I found that `apply_each_pixel_openmp` runs faster than `apply_each_pixel` in a Visual Studio environment on Windows.
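For context, the two parallelization styles under comparison reduce to the following minimal, self-contained sketch over a plain `std::vector<int>` with a toy per-element operation (an illustration only, not part of TinyDIP):

```cpp
// Minimal sketch: the two parallelization styles under comparison,
// applied to a plain std::vector<int> with a toy per-element operation.
#include <algorithm>
#include <execution>
#include <vector>

int main()
{
    std::vector<int> input(1'000'000, 1);
    std::vector<int> output1(input.size());
    std::vector<int> output2(input.size());

    // Style 1: C++17 parallel algorithm with std::execution::par
    std::transform(std::execution::par,
                   input.cbegin(), input.cend(), output1.begin(),
                   [](int x) { return x * 2; });

    // Style 2: OpenMP parallel for (signed loop index, as required by MSVC's /openmp)
    const int n = static_cast<int>(input.size());
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
    {
        output2[i] = input[i] * 2;
    }
}
```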
Compiler Information:
Microsoft (R) C/C++ Optimizing Compiler Version 19.44.34823.2 for x64 Copyright (C) Microsoft Corporation. All rights reserved.
The experimental implementation
`apply_each_pixel` template function implementation:

```cpp
// apply_each_pixel template function implementation
template<class ExPo, class ElementT, class F, class... Args>
requires(std::is_execution_policy_v<std::remove_cvref_t<ExPo>>)
constexpr static auto apply_each_pixel(
    ExPo execution_policy,
    const Image<ElementT>& input,
    F operation,
    Args&&... args)
{
    auto transformed_image_data = recursive_transform<1>(
        execution_policy,
        [&](auto&& pixel_input) { return std::invoke(operation, pixel_input, args...); },
        input.getImageData());
    return Image<std::ranges::range_value_t<decltype(transformed_image_data)>>(
        transformed_image_data, input.getSize());
}
```
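A quick usage sketch (a hypothetical call, assuming an existing `TinyDIP::Image<double>` named `img`; this is not taken from the post's test code) showing how the trailing `Args&&... args` are forwarded to the pixel operation:

```cpp
// Hypothetical usage: the extra argument `gain` is forwarded through Args&&...
// and passed to the operation for every pixel.
double gain = 1.5;    // assumed extra parameter, not part of the library
auto scaled = TinyDIP::apply_each_pixel(
    std::execution::par,
    img,                                              // assumed TinyDIP::Image<double>
    [](double pixel, double g) { return pixel * g; },
    gain);
```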
`apply_each_pixel_openmp` template function implementation:

```cpp
// apply_each_pixel_openmp template function implementation
template<class ElementT, class F, class... Args>
constexpr static auto apply_each_pixel_openmp(const Image<ElementT>& input, F operation, Args&&... args)
{
    std::vector<std::invoke_result_t<F, ElementT, Args...>> output_vector;
    auto input_count = input.count();
    auto input_vector = input.getImageData();
    output_vector.resize(input_count);
    #pragma omp parallel for
    for (std::size_t i = 0; i < input_count; ++i)
    {
        output_vector[i] = std::invoke(operation, input_vector[i], args...);
    }
    return Image<std::invoke_result_t<F, ElementT, Args...>>(output_vector, input.getSize());
}
```
`recursive_transform` template function implementation:

```cpp
// recursive_transform template function for the multiple parameters cases (the version with unwrap_level)
template<std::size_t unwrap_level = 1, std::copy_constructible F, class Arg1, class... Args>
requires(unwrap_level <= recursive_depth<Arg1>())
constexpr auto recursive_transform(const F& f, const Arg1& arg1, const Args&... args)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_variadic_invoke_result_t<unwrap_level, F, Arg1, Args...> output{};
        transform(
            std::inserter(output, std::ranges::end(output)),
            [&f](auto&& element1, auto&&... elements) {
                return recursive_transform<unwrap_level - 1>(f, element1, elements...);
            },
            std::ranges::cbegin(arg1),
            std::ranges::cend(arg1),
            std::ranges::cbegin(args)...
        );
        return output;
    }
    else if constexpr (std::regular_invocable<F, Arg1, Args...>)
    {
        return std::invoke(f, arg1, args...);
    }
    else
    {
        static_assert(!std::regular_invocable<F, Arg1, Args...>,
            "The function passed to recursive_transform() cannot be invoked "
            "with the element types at the given recursion level.");
    }
}

// recursive_transform template function for std::array (the version with unwrap_level)
template<std::size_t unwrap_level = 1,
         template<class, std::size_t> class Container,
         typename T,
         std::size_t N,
         typename F,
         class... Args>
requires (std::ranges::input_range<Container<T, N>>)
constexpr auto recursive_transform(const F& f, const Container<T, N>& arg1, const Args&... args)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_array_invoke_result_t<unwrap_level, F, Container, T, N, Args...> output{};
        transform(
            std::ranges::begin(output),
            [&f](auto&& element1, auto&&... elements) {
                return recursive_transform<unwrap_level - 1>(f, element1, elements...);
            },
            std::ranges::cbegin(arg1),
            std::ranges::cend(arg1),
            std::ranges::cbegin(args)...
        );
        return output;
    }
    else if constexpr (std::regular_invocable<F, Container<T, N>, Args...>)
    {
        return std::invoke(f, arg1, args...);
    }
    else
    {
        static_assert(!std::regular_invocable<F, Container<T, N>, Args...>,
            "The function passed to recursive_transform() cannot be invoked "
            "with the element types at the given recursion level.");
    }
}

// recursive_transform template function implementation (the version with unwrap_level, with execution policy)
template<std::size_t unwrap_level = 1, class ExPo, class T, class F>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>> && unwrap_level <= recursive_depth<T>())
constexpr auto recursive_transform(ExPo execution_policy, const F& f, const T& input)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_invoke_result_t<unwrap_level, F, T> output{};
        output.resize(input.size());
        std::mutex mutex;
        std::transform(
            execution_policy,
            std::ranges::cbegin(input),
            std::ranges::cend(input),
            std::ranges::begin(output),
            [&](auto&& element)
            {
                std::lock_guard lock(mutex);
                return recursive_transform<unwrap_level - 1>(execution_policy, f, element);
            });
        return output;
    }
    else if constexpr (std::regular_invocable<F, T>)
    {
        return std::invoke(f, input);
    }
    else
    {
        static_assert(!std::regular_invocable<F, T>,
            "The function passed to recursive_transform() cannot be invoked "
            "with the element types at the given recursion level.");
    }
}

#ifdef USE_BOOST_ITERATOR
#include <boost/iterator/zip_iterator.hpp>
// recursive_transform template function for the binary operation cases (the version with unwrap_level, with execution policy)
template<std::size_t unwrap_level = 1, class ExPo, class T1, class T2, class F>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>> && unwrap_level <= recursive_depth<T1>())
constexpr auto recursive_transform(ExPo execution_policy, const F& f, const T1& input1, const T2& input2)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_variadic_invoke_result_t<unwrap_level, F, T1, T2> output{};
        assert(input1.size() == input2.size());
        std::mutex mutex;
        //  Reference: https://stackoverflow.com/a/10457201/6667035
        //  Reference: https://www.boost.org/doc/libs/1_76_0/libs/iterator/doc/zip_iterator.html
        std::for_each(
            execution_policy,
            boost::make_zip_iterator(
                boost::make_tuple(std::ranges::cbegin(input1), std::ranges::cbegin(input2))
            ),
            boost::make_zip_iterator(
                boost::make_tuple(std::ranges::cend(input1), std::ranges::cend(input2))
            ),
            [&](auto&& elements)
            {
                auto result = recursive_transform<unwrap_level - 1>(
                    execution_policy, f, boost::get<0>(elements), boost::get<1>(elements));
                std::lock_guard lock(mutex);
                output.emplace_back(std::move(result));
            }
        );
        return output;
    }
    else
    {
        return std::invoke(f, input1, input2);
    }
}
#endif
```
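One detail worth highlighting for readers comparing the two timings: in the execution-policy overload above, the lambda passed to `std::transform` acquires `std::lock_guard lock(mutex)` before doing the per-element work, so at `unwrap_level == 1` the parallel transform is effectively serialized. A lock-free sketch of the single-level case (an illustration only, not the library's implementation) could look like:

```cpp
// Illustration only: at the innermost level each result is written to its own output
// slot by std::transform, so no mutex is needed for the per-element work.
#include <algorithm>
#include <execution>
#include <functional>
#include <type_traits>
#include <vector>

template<class ExPo, class ElementT, class F>
auto transform_single_level(ExPo execution_policy, const std::vector<ElementT>& input, const F& f)
{
    std::vector<std::invoke_result_t<F, ElementT>> output(input.size());
    std::transform(execution_policy,
                   input.cbegin(), input.cend(), output.begin(),
                   [&f](const ElementT& element) { return std::invoke(f, element); });
    return output;
}
```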
The usage of the `apply_each_pixel` and `apply_each_pixel_openmp` template functions:
```cpp
/* Developed by Jimmy Hu */

#include <chrono>
#include "../base_types.h"
#include "../basic_functions.h"
#include "../image.h"
#include "../image_io.h"
#include "../image_operations.h"

template<class ExPo, class ElementT>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>>)
constexpr static auto Processing(
    ExPo execution_policy,
    const TinyDIP::Image<ElementT>& input,
    std::ostream& os = std::cout
)
{
    auto hsv_image = TinyDIP::rgb2hsv(execution_policy, input);
    auto start1 = std::chrono::system_clock::now();
    auto processed_hsv_image1 = TinyDIP::apply_each_pixel(
        std::execution::par,
        hsv_image,
        [&](TinyDIP::HSV pixel) -> TinyDIP::HSV
        {
            TinyDIP::HSV pixel_for_filling;
            pixel_for_filling.channels[0] = 0;
            pixel_for_filling.channels[1] = 1;
            pixel_for_filling.channels[2] = 255;
            if (pixel.channels[2] > 100 && pixel.channels[2] < 220)
            {
                pixel = pixel_for_filling;
            }
            return pixel;
        }
    );
    auto end1 = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds1 = end1 - start1;
    os << "elapsed time with using parallel execution policy: " << elapsed_seconds1.count() << '\n';
    auto start2 = std::chrono::system_clock::now();
    auto processed_hsv_image2 = TinyDIP::apply_each_pixel_openmp(
        hsv_image,
        [&](TinyDIP::HSV pixel) -> TinyDIP::HSV
        {
            TinyDIP::HSV pixel_for_filling;
            pixel_for_filling.channels[0] = 0;
            pixel_for_filling.channels[1] = 1;
            pixel_for_filling.channels[2] = 255;
            if (pixel.channels[2] > 100 && pixel.channels[2] < 220)
            {
                pixel = pixel_for_filling;
            }
            return pixel;
        });
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds2 = end2 - start2;
    os << "elapsed time with using Openmp: " << elapsed_seconds2.count() << '\n';
    os << TinyDIP::sum(TinyDIP::difference(processed_hsv_image1, processed_hsv_image2)) << '\n';
    auto output_image = TinyDIP::hsv2rgb(execution_policy, processed_hsv_image1);
    return output_image;
}

int main()
{
    auto start = std::chrono::system_clock::now();
    std::string image_filename = "1.bmp";
    auto image_input = TinyDIP::bmp_read(image_filename.c_str(), true);
    image_input = TinyDIP::copyResizeBicubic(image_input, 3 * image_input.getWidth(), 3 * image_input.getHeight());
    TinyDIP::bmp_write("BeforeProcessing", image_input);
    auto output_image = Processing(std::execution::seq, image_input);
    TinyDIP::bmp_write("AfterProcessing", output_image);
    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    std::cout << "Computation finished at " << std::ctime(&end_time)
              << "elapsed time: " << elapsed_seconds.count() << '\n';
    return EXIT_SUCCESS;
}
```
The output of the test code above:
```
Width of the input image: 454
Height of the input image: 341
Size of the input image(Byte): 464442
elapsed time with using parallel execution policy: 0.142722
elapsed time with using Openmp: 0.0333061
{0, 0, 0}
Computation finished at Thu Feb 13 14:23:46 2025
elapsed time: 0.563649
```

The `{0, 0, 0}` line is the channel-wise sum of the difference between the two processed images; it being all zeros confirms that both versions produce identical output.
All suggestions are welcome.
The summary information:
Which question is it a follow-up to?
An Updated Multi-dimensional Image Data Structure with Variadic Template Functions in C++
What changes have been made in the code since the last question?
I implemented the `apply_each_pixel` and `apply_each_pixel_openmp` template functions in this post.
Why is a new review being asked for?
Please review the implementation of the `apply_each_pixel` and `apply_each_pixel_openmp` template functions and their tests.
A comment on the question notes:

> The `std::par` one is using complicated recursion down to the point of doing a single pixel in each task. That will generate many more tasks than are needed (so much more task creation and scheduling cost). Can you not use a simple parallel loop in the C++ code (maybe `std::for_each(std::execution::par_unseq, ...)`)?
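For completeness, a sketch of the kind of flat parallel loop the comment suggests (my reading of the suggestion; `apply_each_pixel_flat` and its index-based `std::for_each` are an assumption, not existing TinyDIP code):

```cpp
// Sketch (assumption, not part of TinyDIP): a single flat parallel pass over the
// pixel data using std::for_each with std::execution::par_unseq over an index range.
#include <algorithm>
#include <execution>
#include <functional>
#include <numeric>
#include <type_traits>
#include <vector>

template<class ElementT, class F>
auto apply_each_pixel_flat(const std::vector<ElementT>& pixels, F operation)
{
    std::vector<std::invoke_result_t<F, ElementT>> output(pixels.size());
    std::vector<std::size_t> indices(pixels.size());
    std::iota(indices.begin(), indices.end(), std::size_t{0});
    std::for_each(std::execution::par_unseq, indices.begin(), indices.end(),
                  [&](std::size_t i) { output[i] = std::invoke(operation, pixels[i]); });
    return output;
}
```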