diff --git a/Marlin/src/HAL/HAL_STM32/HAL.cpp b/Marlin/src/HAL/HAL_STM32/HAL.cpp index d3559420a..677653000 100644 --- a/Marlin/src/HAL/HAL_STM32/HAL.cpp +++ b/Marlin/src/HAL/HAL_STM32/HAL.cpp @@ -30,6 +30,7 @@ #include "HAL.h" #include "../../inc/MarlinConfig.h" +#include "../shared/Delay.h" #if ENABLED(EEPROM_EMULATED_WITH_SRAM) #if STM32F7xx @@ -80,6 +81,11 @@ uint16_t HAL_adc_result; // HAL initialization task void HAL_init(void) { + // Needed for DELAY_NS() / DELAY_US() on CORTEX-M7 + #if (defined(__arm__) || defined(__thumb__)) && __CORTEX_M == 7 + enableCycleCounter(); + #endif + FastIO_init(); #if ENABLED(SDSUPPORT) diff --git a/Marlin/src/HAL/HAL_STM32F7/HAL.h b/Marlin/src/HAL/HAL_STM32F7/HAL.h index c911ff07f..a18fb9b0e 100644 --- a/Marlin/src/HAL/HAL_STM32F7/HAL.h +++ b/Marlin/src/HAL/HAL_STM32F7/HAL.h @@ -153,8 +153,6 @@ extern uint16_t HAL_adc_result; // Public functions // -------------------------------------------------------------------------- - - // Memory related #define __bss_end __bss_end__ diff --git a/Marlin/src/HAL/HAL_TEENSY31_32/HAL.cpp b/Marlin/src/HAL/HAL_TEENSY31_32/HAL.cpp index fa5cdd8d1..db757b17e 100644 --- a/Marlin/src/HAL/HAL_TEENSY31_32/HAL.cpp +++ b/Marlin/src/HAL/HAL_TEENSY31_32/HAL.cpp @@ -26,7 +26,7 @@ #ifdef __MK20DX256__ #include "HAL.h" -#include "../Delay.h" +#include "../shared/Delay.h" #include diff --git a/Marlin/src/HAL/shared/Delay.h b/Marlin/src/HAL/shared/Delay.h index 4cdd47921..9e25cbd51 100644 --- a/Marlin/src/HAL/shared/Delay.h +++ b/Marlin/src/HAL/shared/Delay.h @@ -19,6 +19,7 @@ * along with this program. If not, see . * */ +#pragma once /** * Busy wait delay cycles routines: @@ -28,57 +29,81 @@ * DELAY_US(count): Delay execution in microseconds */ -#ifndef MARLIN_DELAY_H -#define MARLIN_DELAY_H - #include "../../core/macros.h" +#include "../../core/millis_t.h" #if defined(__arm__) || defined(__thumb__) - // https://blueprints.launchpad.net/gcc-arm-embedded/+spec/delay-cycles + #if __CORTEX_M == 7 - #define nop() __asm__ __volatile__("nop;\n\t":::) + // Cortex-M7 can use the cycle counter of the DWT unit + // http://www.anthonyvh.com/2017/05/18/cortex_m-cycle_counter/ - FORCE_INLINE static void __delay_4cycles(uint32_t cy) { // +1 cycle - #if ARCH_PIPELINE_RELOAD_CYCLES < 2 - #define EXTRA_NOP_CYCLES A("nop") - #else - #define EXTRA_NOP_CYCLES "" - #endif + FORCE_INLINE static void enableCycleCounter() { + CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; - __asm__ __volatile__( - A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax - L("1") - A("subs %[cnt],#1") - EXTRA_NOP_CYCLES - A("bne 1b") - : [cnt]"+r"(cy) // output: +r means input+output - : // input: - : "cc" // clobbers: - ); - } + // Unlock DWT. + DWT->LAR = 0xC5ACCE55; - // Delay in cycles - FORCE_INLINE static void DELAY_CYCLES(uint32_t x) { - - if (__builtin_constant_p(x)) { - #define MAXNOPS 4 - - if (x <= (MAXNOPS)) { - switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); } - } - else { // because of +1 cycle inside delay_4cycles - const uint32_t rem = (x - 1) % (MAXNOPS); - switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); } - if ((x = (x - 1) / (MAXNOPS))) - __delay_4cycles(x); // if need more then 4 nop loop is more optimal - } - #undef MAXNOPS + DWT->CYCCNT = 0; + DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk; } - else if ((x >>= 2)) - __delay_4cycles(x); - } - #undef nop + + FORCE_INLINE volatile uint32_t getCycleCount() { return DWT->CYCCNT; } + + FORCE_INLINE static void DELAY_CYCLES(const uint32_t x) { + const uint32_t endCycles = getCycleCount() + x; + while (PENDING(getCycleCount(), endCycles)) { } + } + + #else + + // https://blueprints.launchpad.net/gcc-arm-embedded/+spec/delay-cycles + + #define nop() __asm__ __volatile__("nop;\n\t":::) + + FORCE_INLINE static void __delay_4cycles(uint32_t cy) { // +1 cycle + #if ARCH_PIPELINE_RELOAD_CYCLES < 2 + #define EXTRA_NOP_CYCLES A("nop") + #else + #define EXTRA_NOP_CYCLES "" + #endif + + __asm__ __volatile__( + A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax + L("1") + A("subs %[cnt],#1") + EXTRA_NOP_CYCLES + A("bne 1b") + : [cnt]"+r"(cy) // output: +r means input+output + : // input: + : "cc" // clobbers: + ); + } + + // Delay in cycles + FORCE_INLINE static void DELAY_CYCLES(uint32_t x) { + + if (__builtin_constant_p(x)) { + #define MAXNOPS 4 + + if (x <= (MAXNOPS)) { + switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); } + } + else { // because of +1 cycle inside delay_4cycles + const uint32_t rem = (x - 1) % (MAXNOPS); + switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); } + if ((x = (x - 1) / (MAXNOPS))) + __delay_4cycles(x); // if need more then 4 nop loop is more optimal + } + #undef MAXNOPS + } + else if ((x >>= 2)) + __delay_4cycles(x); + } + #undef nop + + #endif #elif defined(__AVR__) @@ -144,5 +169,3 @@ // Delay in microseconds #define DELAY_US(x) DELAY_CYCLES( (x) * (F_CPU / 1000000UL) ) - -#endif // MARLIN_DELAY_H