Skip to content

high memory usage for Padam optimizer  #10

Open
@leo-smi

Description

@leo-smi

I'm getting high memory usage (it started at 12 GB and the error occurred at 18 GB):

I'm using the Intel TensorFlow plugin on an Intel Iris Xe GPU.

image
image
image

ERROR:

ResourceExhaustedError                    Traceback (most recent call last)
Cell In[7], line 89
     86 printlogcallback = tf.keras.callbacks.LambdaCallback(on_batch_end=printlog)            
     88 # treina o modelo
---> 89 History = fold_model.fit(
     90     train_generator_fold, 
     91     batch_size = batch_size,
     92     epochs = epochs,
     93     callbacks=[printlogcallback],
     94     validation_data = (val_generator_fold),
     95     verbose = 1 # mostra a barra de progresso
     96 )
     98 # Suponha que 'model' é o seu modelo treinado
     99 save_model(fold_model, f'./modelos_h5/{key}_fold{fold+1}_batches{batch_size}_epochs{epochs}.h5')

File ~\.conda\envs\directml\lib\site-packages\keras\utils\traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     67     filtered_tb = _process_traceback_frames(e.__traceback__)
     68     # To get the full stack trace, call:
     69     # `tf.debugging.disable_traceback_filtering()`
---> 70     raise e.with_traceback(filtered_tb) from None
     71 finally:
     72     del filtered_tb

File ~\.conda\envs\directml\lib\site-packages\tensorflow\python\eager\execute.py:54, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     52 try:
     53   ctx.ensure_initialized()
---> 54   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     55                                       inputs, attrs, num_outputs)
     56 except core._NotOkStatusException as e:
     57   if name is not None:

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/model/block_1_pad/Slice_1' defined at (most recent call last):
    File "C:\Users\leand\.conda\envs\directml\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\leand\.conda\envs\directml\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
      app.start()
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\kernelapp.py", line 711, in start
      self.io_loop.start()
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\leand\.conda\envs\directml\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\Users\leand\.conda\envs\directml\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "C:\Users\leand\.conda\envs\directml\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\ipykernel\zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\IPython\core\interactiveshell.py", line 2945, in run_cell
      result = self._run_cell(
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\IPython\core\interactiveshell.py", line 3000, in _run_cell
      return runner(coro)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\IPython\core\interactiveshell.py", line 3203, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\IPython\core\interactiveshell.py", line 3382, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\leand\AppData\Local\Temp\ipykernel_17272\2278945961.py", line 89, in <module>
      History = fold_model.fit(
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\engine\training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\leand\.conda\envs\directml\lib\site-packages\keras\optimizers\optimizer_v1.py", line 872, in minimize
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/model/block_1_pad/Slice_1'
OOM when allocating tensor with shape[8,96,100,100] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator PluggableDevice_0_bfc
	 [[{{node gradient_tape/model/block_1_pad/Slice_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_20953]

MY MODEL:

# MODELO DE BASE
# https://keras.io/api/applications/

def model(modelo, n_category=9, input_shape=(200, 200, 3)):
    """Build a transfer-learning classifier on top of a pretrained base.

    Args:
        modelo: A Keras Applications constructor (e.g. ``MobileNetV2``;
            see https://keras.io/api/applications/) that accepts
            ``include_top``, ``weights`` and ``input_shape``.
        n_category: Number of output classes. Defaults to 9
            (the original hard-coded number of tomato types).
        input_shape: Input image shape as (height, width, channels).
            Defaults to (200, 200, 3) as in the original code.

    Returns:
        A ``keras.Model`` mapping images of ``input_shape`` to softmax
        probabilities over ``n_category`` classes.
    """
    # Pretrained base; include_top=False drops the original classifier head
    # so a custom head can be attached below.
    base_model = modelo(
        include_top=False,
        weights="imagenet",  # pretrained weights instead of random init
        input_shape=input_shape,
    )

    # New classification head on top of the base's feature maps.
    features = base_model.output
    features = GlobalAveragePooling2D()(features)
    features = Dropout(0.25)(features)  # regularization before the output layer

    # Prediction (output) layer: one softmax unit per category.
    prediction_layer = Dense(n_category, activation='softmax')(features)

    # Couple the base input and the new output into a single trainable model.
    return Model(
        inputs=base_model.input,    # input comes from the pretrained base
        outputs=prediction_layer,   # output sized by the number of categories
    )

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions