]> git.sesse.net Git - ffmpeg/blob - libavfilter/dnn_backend_native.c
avfilter/setpts: add FR shorthand for FRAME_RATE
[ffmpeg] / libavfilter / dnn_backend_native.c
1 /*
2  * Copyright (c) 2018 Sergey Lavrushkin
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 /**
22  * @file
23  * DNN native backend implementation.
24  */
25
#include <limits.h>
#include <stdint.h>

#include "dnn_backend_native.h"
#include "dnn_srcnn.h"
#include "dnn_espcn.h"
#include "libavformat/avio.h"
30
// Kinds of layers supported by the native backend.
typedef enum {INPUT, CONV, DEPTH_TO_SPACE} LayerType;

// Activation functions applied after a convolution (see convolve()).
typedef enum {RELU, TANH, SIGMOID} ActivationFunc;

// One network layer: its kind, the buffer holding its output
// (allocated in set_input_output_native), and kind-specific parameters.
typedef struct Layer{
    LayerType type;
    float* output;
    void* params;
} Layer;

// Parameters of a CONV layer. The kernel is stored as
// output_num x kernel_size x kernel_size x input_num floats
// (this layout follows the indexing used in convolve()).
typedef struct ConvolutionalParams{
    int32_t input_num, output_num, kernel_size;
    ActivationFunc activation;
    float* kernel;
    float* biases;
} ConvolutionalParams;

// Dimensions of the network input, filled in by set_input_output_native.
typedef struct InputParams{
    int height, width, channels;
} InputParams;

// Parameters of a DEPTH_TO_SPACE layer: per-axis upscaling factor.
typedef struct DepthToSpaceParams{
    int block_size;
} DepthToSpaceParams;

// Represents simple feed-forward convolutional network.
typedef struct ConvolutionalNetwork{
    Layer* layers;
    int32_t layers_num;
} ConvolutionalNetwork;
61
62 static DNNReturnType set_input_output_native(void* model, DNNData* input, DNNData* output)
63 {
64     ConvolutionalNetwork* network = (ConvolutionalNetwork*)model;
65     InputParams* input_params;
66     ConvolutionalParams* conv_params;
67     DepthToSpaceParams* depth_to_space_params;
68     int cur_width, cur_height, cur_channels;
69     int32_t layer;
70
71     if (network->layers_num <= 0 || network->layers[0].type != INPUT){
72         return DNN_ERROR;
73     }
74     else{
75         input_params = (InputParams*)network->layers[0].params;
76         input_params->width = cur_width = input->width;
77         input_params->height = cur_height = input->height;
78         input_params->channels = cur_channels = input->channels;
79         if (input->data){
80             av_freep(&input->data);
81         }
82         network->layers[0].output = input->data = av_malloc(cur_height * cur_width * cur_channels * sizeof(float));
83         if (!network->layers[0].output){
84             return DNN_ERROR;
85         }
86     }
87
88     for (layer = 1; layer < network->layers_num; ++layer){
89         switch (network->layers[layer].type){
90         case CONV:
91             conv_params = (ConvolutionalParams*)network->layers[layer].params;
92             if (conv_params->input_num != cur_channels){
93                 return DNN_ERROR;
94             }
95             cur_channels = conv_params->output_num;
96             break;
97         case DEPTH_TO_SPACE:
98             depth_to_space_params = (DepthToSpaceParams*)network->layers[layer].params;
99             if (cur_channels % (depth_to_space_params->block_size * depth_to_space_params->block_size) != 0){
100                 return DNN_ERROR;
101             }
102             cur_channels = cur_channels / (depth_to_space_params->block_size * depth_to_space_params->block_size);
103             cur_height *= depth_to_space_params->block_size;
104             cur_width *= depth_to_space_params->block_size;
105             break;
106         default:
107             return DNN_ERROR;
108         }
109         if (network->layers[layer].output){
110             av_freep(&network->layers[layer].output);
111         }
112         network->layers[layer].output = av_malloc(cur_height * cur_width * cur_channels * sizeof(float));
113         if (!network->layers[layer].output){
114             return DNN_ERROR;
115         }
116     }
117
118     output->data = network->layers[network->layers_num - 1].output;
119     output->height = cur_height;
120     output->width = cur_width;
121     output->channels = cur_channels;
122
123     return DNN_SUCCESS;
124 }
125
126 // Loads model and its parameters that are stored in a binary file with following structure:
127 // layers_num,layer_type,layer_parameterss,layer_type,layer_parameters...
128 // For CONV layer: activation_function, input_num, output_num, kernel_size, kernel, biases
129 // For DEPTH_TO_SPACE layer: block_size
130 DNNModel* ff_dnn_load_model_native(const char* model_filename)
131 {
132     DNNModel* model = NULL;
133     ConvolutionalNetwork* network = NULL;
134     AVIOContext* model_file_context;
135     int file_size, dnn_size, kernel_size, i;
136     int32_t layer;
137     LayerType layer_type;
138     ConvolutionalParams* conv_params;
139     DepthToSpaceParams* depth_to_space_params;
140
141     model = av_malloc(sizeof(DNNModel));
142     if (!model){
143         return NULL;
144     }
145
146     if (avio_open(&model_file_context, model_filename, AVIO_FLAG_READ) < 0){
147         av_freep(&model);
148         return NULL;
149     }
150     file_size = avio_size(model_file_context);
151
152     network = av_malloc(sizeof(ConvolutionalNetwork));
153     if (!network){
154         avio_closep(&model_file_context);
155         av_freep(&model);
156         return NULL;
157     }
158     model->model = (void*)network;
159
160     network->layers_num = 1 + (int32_t)avio_rl32(model_file_context);
161     dnn_size = 4;
162
163     network->layers = av_malloc(network->layers_num * sizeof(Layer));
164     if (!network->layers){
165         av_freep(&network);
166         avio_closep(&model_file_context);
167         av_freep(&model);
168         return NULL;
169     }
170
171     for (layer = 0; layer < network->layers_num; ++layer){
172         network->layers[layer].output = NULL;
173         network->layers[layer].params = NULL;
174     }
175     network->layers[0].type = INPUT;
176     network->layers[0].params = av_malloc(sizeof(InputParams));
177     if (!network->layers[0].params){
178         avio_closep(&model_file_context);
179         ff_dnn_free_model_native(&model);
180         return NULL;
181     }
182
183     for (layer = 1; layer < network->layers_num; ++layer){
184         layer_type = (int32_t)avio_rl32(model_file_context);
185         dnn_size += 4;
186         switch (layer_type){
187         case CONV:
188             conv_params = av_malloc(sizeof(ConvolutionalParams));
189             if (!conv_params){
190                 avio_closep(&model_file_context);
191                 ff_dnn_free_model_native(&model);
192                 return NULL;
193             }
194             conv_params->activation = (int32_t)avio_rl32(model_file_context);
195             conv_params->input_num = (int32_t)avio_rl32(model_file_context);
196             conv_params->output_num = (int32_t)avio_rl32(model_file_context);
197             conv_params->kernel_size = (int32_t)avio_rl32(model_file_context);
198             kernel_size = conv_params->input_num * conv_params->output_num *
199                           conv_params->kernel_size * conv_params->kernel_size;
200             dnn_size += 16 + (kernel_size + conv_params->output_num << 2);
201             if (dnn_size > file_size || conv_params->input_num <= 0 ||
202                 conv_params->output_num <= 0 || conv_params->kernel_size <= 0){
203                 avio_closep(&model_file_context);
204                 ff_dnn_free_model_native(&model);
205                 return NULL;
206             }
207             conv_params->kernel = av_malloc(kernel_size * sizeof(float));
208             conv_params->biases = av_malloc(conv_params->output_num * sizeof(float));
209             if (!conv_params->kernel || !conv_params->biases){
210                 avio_closep(&model_file_context);
211                 ff_dnn_free_model_native(&model);
212                 return NULL;
213             }
214             for (i = 0; i < kernel_size; ++i){
215                 conv_params->kernel[i] = av_int2float(avio_rl32(model_file_context));
216             }
217             for (i = 0; i < conv_params->output_num; ++i){
218                 conv_params->biases[i] = av_int2float(avio_rl32(model_file_context));
219             }
220             network->layers[layer].type = CONV;
221             network->layers[layer].params = conv_params;
222             break;
223         case DEPTH_TO_SPACE:
224             depth_to_space_params = av_malloc(sizeof(DepthToSpaceParams));
225             if (!depth_to_space_params){
226                 avio_closep(&model_file_context);
227                 ff_dnn_free_model_native(&model);
228                 return NULL;
229             }
230             depth_to_space_params->block_size = (int32_t)avio_rl32(model_file_context);
231             dnn_size += 4;
232             network->layers[layer].type = DEPTH_TO_SPACE;
233             network->layers[layer].params = depth_to_space_params;
234             break;
235         default:
236             avio_closep(&model_file_context);
237             ff_dnn_free_model_native(&model);
238             return NULL;
239         }
240     }
241
242     avio_closep(&model_file_context);
243
244     if (dnn_size != file_size){
245         ff_dnn_free_model_native(&model);
246         return NULL;
247     }
248
249     model->set_input_output = &set_input_output_native;
250
251     return model;
252 }
253
254 static int set_up_conv_layer(Layer* layer, const float* kernel, const float* biases, ActivationFunc activation,
255                              int32_t input_num, int32_t output_num, int32_t size)
256 {
257     ConvolutionalParams* conv_params;
258     int kernel_size;
259
260     conv_params = av_malloc(sizeof(ConvolutionalParams));
261     if (!conv_params){
262         return DNN_ERROR;
263     }
264     conv_params->activation = activation;
265     conv_params->input_num = input_num;
266     conv_params->output_num = output_num;
267     conv_params->kernel_size = size;
268     kernel_size = input_num * output_num * size * size;
269     conv_params->kernel = av_malloc(kernel_size * sizeof(float));
270     conv_params->biases = av_malloc(conv_params->output_num * sizeof(float));
271     if (!conv_params->kernel || !conv_params->biases){
272         av_freep(&conv_params->kernel);
273         av_freep(&conv_params->biases);
274         av_freep(&conv_params);
275         return DNN_ERROR;
276     }
277     memcpy(conv_params->kernel, kernel, kernel_size * sizeof(float));
278     memcpy(conv_params->biases, biases, output_num * sizeof(float));
279     layer->type = CONV;
280     layer->params = conv_params;
281
282     return DNN_SUCCESS;
283 }
284
285 DNNModel* ff_dnn_load_default_model_native(DNNDefaultModel model_type)
286 {
287     DNNModel* model = NULL;
288     ConvolutionalNetwork* network = NULL;
289     DepthToSpaceParams* depth_to_space_params;
290     int32_t layer;
291
292     model = av_malloc(sizeof(DNNModel));
293     if (!model){
294         return NULL;
295     }
296
297     network = av_malloc(sizeof(ConvolutionalNetwork));
298     if (!network){
299         av_freep(&model);
300         return NULL;
301     }
302     model->model = (void*)network;
303
304     switch (model_type){
305     case DNN_SRCNN:
306         network->layers_num = 4;
307         break;
308     case DNN_ESPCN:
309         network->layers_num = 5;
310         break;
311     default:
312         av_freep(&network);
313         av_freep(&model);
314         return NULL;
315     }
316
317     network->layers = av_malloc(network->layers_num * sizeof(Layer));
318     if (!network->layers){
319         av_freep(&network);
320         av_freep(&model);
321         return NULL;
322     }
323
324     for (layer = 0; layer < network->layers_num; ++layer){
325         network->layers[layer].output = NULL;
326         network->layers[layer].params = NULL;
327     }
328     network->layers[0].type = INPUT;
329     network->layers[0].params = av_malloc(sizeof(InputParams));
330     if (!network->layers[0].params){
331         ff_dnn_free_model_native(&model);
332         return NULL;
333     }
334
335     switch (model_type){
336     case DNN_SRCNN:
337         if (set_up_conv_layer(network->layers + 1, srcnn_conv1_kernel, srcnn_conv1_biases, RELU, 1, 64, 9) != DNN_SUCCESS ||
338             set_up_conv_layer(network->layers + 2, srcnn_conv2_kernel, srcnn_conv2_biases, RELU, 64, 32, 1) != DNN_SUCCESS ||
339             set_up_conv_layer(network->layers + 3, srcnn_conv3_kernel, srcnn_conv3_biases, RELU, 32, 1, 5) != DNN_SUCCESS){
340             ff_dnn_free_model_native(&model);
341             return NULL;
342         }
343         break;
344     case DNN_ESPCN:
345         if (set_up_conv_layer(network->layers + 1, espcn_conv1_kernel, espcn_conv1_biases, TANH, 1, 64, 5) != DNN_SUCCESS ||
346             set_up_conv_layer(network->layers + 2, espcn_conv2_kernel, espcn_conv2_biases, TANH, 64, 32, 3) != DNN_SUCCESS ||
347             set_up_conv_layer(network->layers + 3, espcn_conv3_kernel, espcn_conv3_biases, SIGMOID, 32, 4, 3) != DNN_SUCCESS){
348             ff_dnn_free_model_native(&model);
349             return NULL;
350         }
351         network->layers[4].type = DEPTH_TO_SPACE;
352         depth_to_space_params = av_malloc(sizeof(DepthToSpaceParams));
353         if (!depth_to_space_params){
354             ff_dnn_free_model_native(&model);
355             return NULL;
356         }
357         depth_to_space_params->block_size = 2;
358         network->layers[4].params = depth_to_space_params;
359     }
360
361     model->set_input_output = &set_input_output_native;
362
363     return model;
364 }
365
366 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
367
368 static void convolve(const float* input, float* output, const ConvolutionalParams* conv_params, int width, int height)
369 {
370     int y, x, n_filter, ch, kernel_y, kernel_x;
371     int radius = conv_params->kernel_size >> 1;
372     int src_linesize = width * conv_params->input_num;
373     int filter_linesize = conv_params->kernel_size * conv_params->input_num;
374     int filter_size = conv_params->kernel_size * filter_linesize;
375
376     for (y = 0; y < height; ++y){
377         for (x = 0; x < width; ++x){
378             for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
379                 output[n_filter] = conv_params->biases[n_filter];
380                 for (ch = 0; ch < conv_params->input_num; ++ch){
381                     for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
382                         for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
383                             output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
384                                                       CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch] *
385                                                 conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
386                                                                     kernel_x * conv_params->input_num + ch];
387                         }
388                     }
389                 }
390                 switch (conv_params->activation){
391                 case RELU:
392                     output[n_filter] = FFMAX(output[n_filter], 0.0);
393                     break;
394                 case TANH:
395                     output[n_filter] = 2.0f  / (1.0f + exp(-2.0f * output[n_filter])) - 1.0f;
396                     break;
397                 case SIGMOID:
398                     output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
399                 }
400             }
401             output += conv_params->output_num;
402         }
403     }
404 }
405
/**
 * Rearranges an interleaved height x width x channels image so that each
 * block_size*block_size group of channels becomes a block_size x block_size
 * spatial block: output is (height*block_size) x (width*block_size) x
 * (channels / block_size^2). channels must be divisible by block_size^2.
 */
static void depth_to_space(const float* input, float* output, int block_size, int width, int height, int channels)
{
    const int out_channels = channels / (block_size * block_size);
    const int block_rows = width * channels;             // block_size output rows
    const int row_stride = block_rows / block_size;      // one output row
    const int pixel_stride = out_channels * block_size;  // one input pixel, expanded
    const float* src = input;
    int y, x, by, bx, c;

    for (y = 0; y < height; ++y){
        float* dst = output + y * block_rows;
        for (x = 0; x < width; ++x){
            for (by = 0; by < block_size; ++by){
                for (bx = 0; bx < block_size; ++bx){
                    float* cell = dst + by * row_stride + x * pixel_stride + bx * out_channels;
                    for (c = 0; c < out_channels; ++c){
                        cell[c] = src[c];
                    }
                    src += out_channels;
                }
            }
        }
    }
}
428
429 DNNReturnType ff_dnn_execute_model_native(const DNNModel* model)
430 {
431     ConvolutionalNetwork* network = (ConvolutionalNetwork*)model->model;
432     int cur_width, cur_height, cur_channels;
433     int32_t layer;
434     InputParams* input_params;
435     ConvolutionalParams* conv_params;
436     DepthToSpaceParams* depth_to_space_params;
437
438     if (network->layers_num <= 0 || network->layers[0].type != INPUT || !network->layers[0].output){
439         return DNN_ERROR;
440     }
441     else{
442         input_params = (InputParams*)network->layers[0].params;
443         cur_width = input_params->width;
444         cur_height = input_params->height;
445         cur_channels = input_params->channels;
446     }
447
448     for (layer = 1; layer < network->layers_num; ++layer){
449         if (!network->layers[layer].output){
450             return DNN_ERROR;
451         }
452         switch (network->layers[layer].type){
453         case CONV:
454             conv_params = (ConvolutionalParams*)network->layers[layer].params;
455             convolve(network->layers[layer - 1].output, network->layers[layer].output, conv_params, cur_width, cur_height);
456             cur_channels = conv_params->output_num;
457             break;
458         case DEPTH_TO_SPACE:
459             depth_to_space_params = (DepthToSpaceParams*)network->layers[layer].params;
460             depth_to_space(network->layers[layer - 1].output, network->layers[layer].output,
461                            depth_to_space_params->block_size, cur_width, cur_height, cur_channels);
462             cur_height *= depth_to_space_params->block_size;
463             cur_width *= depth_to_space_params->block_size;
464             cur_channels /= depth_to_space_params->block_size * depth_to_space_params->block_size;
465             break;
466         case INPUT:
467             return DNN_ERROR;
468         }
469     }
470
471     return DNN_SUCCESS;
472 }
473
474 void ff_dnn_free_model_native(DNNModel** model)
475 {
476     ConvolutionalNetwork* network;
477     ConvolutionalParams* conv_params;
478     int32_t layer;
479
480     if (*model)
481     {
482         network = (ConvolutionalNetwork*)(*model)->model;
483         for (layer = 0; layer < network->layers_num; ++layer){
484             av_freep(&network->layers[layer].output);
485             if (network->layers[layer].type == CONV){
486                 conv_params = (ConvolutionalParams*)network->layers[layer].params;
487                 av_freep(&conv_params->kernel);
488                 av_freep(&conv_params->biases);
489             }
490             av_freep(&network->layers[layer].params);
491         }
492         av_freep(network);
493         av_freep(model);
494     }
495 }