/*
    Copyright 2005-2008 Intel Corporation.  All Rights Reserved.

    This file is part of Threading Building Blocks.

    Threading Building Blocks is free software; you can redistribute it
    and/or modify it under the terms of the GNU General Public License
    version 2 as published by the Free Software Foundation.

    Threading Building Blocks is distributed in the hope that it will be
    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Threading Building Blocks; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    As a special exception, you may use this file as part of a free software
    library without restriction.  Specifically, if other files instantiate
    templates or use macros or inline functions from this file, or you compile
    this file and link it with other files to produce an executable, this
    file does not by itself cause the resulting executable to be covered by
    the GNU General Public License.  This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/

// Files gcc_{alpha,arm,mips,power}.h should be maintained side by side for maximum similarity,
// with an editor that automatically highlights the differences between two or more files.
// Alpha and MIPS both return load/store transaction outcome in the register that held the store argument.
// ARM is the only one of the foursome providing transactional load/store on all 4 sizes.

#ifndef __TBB_machine_H
#error Do not include this file directly; include tbb_machine.h instead
#endif

// This file currently targets "ARM11 MPCore Processor".
// TODO: what are (other) relevant versions, and how to distinguish them?

// ARM convention: "byte"/"halfword"/"word"/"doubleword"
// Transactional primitives "Load Exclusive"/"Store Exclusive" for 8/16/32/64 bits.
#define __TBB_REGISTERSIZE 4
#define __TBB_BIG_ENDIAN 0
#define __TBB_ORDERS_STORES 0
#define __TBB_ORDERS_LOADS 0

// __TBB_fence

    // Note that each TLB entry can affect memory semantics,
    // but these operations must assume that nothing is guaranteed.

    // TODO: investigate further
    // This may look inspired, but it's more a guess at this time.
    // There is even a mismatch between "Data Synchronization Barrier" and "Drain Synchronization Barrier",
    // but that may be an accidential contamination in the specification because:
    // "Note: The Data Synchronization Barrier operation is synonymous with Drain Write Buffer
    // and Data Write Barrier in earlier versions of the architecture."
    // But that may also mean that the Data Synchronization Barrier is a mere StoreStore barrier
    // (to be used with release)?
    // Also, nothing has been done about Should Be Zero for the input register.

    #define __TBB_M(M,mnemonic)                                                                    \
    template<> struct __TBB_fence<false,tbb::M> {                                                  \
        static inline void op() { __asm__ __volatile__ (mnemonic: : :"memory"); }                  \
    };
    __TBB_M(release,"mcr p15, 0, r0, c7, c10, 5") // Data Memory Barrier
    __TBB_M(acquire,"mcr p15, 0, r0, c7, c10, 5") // Data Memory Barrier
    __TBB_M(ordered,"mcr p15, 0, r0, c7, c10, 4") // Data Synchronization Barrier
    #undef __TBB_M

    // TODO validate: No basic operation provides is_ordered=true.
    #define __TBB_MACHINE_TRANSACTION_IS_ORDERED false

// Note that the load for transaction will zero-extend the value,
// which can probably be ignored for __TBB_(Fetch)Op,
// but probably needs special treatment for a comparand,
// which is presumed to be sign-extended?
// Note that taking an unsignedword copy is superfluous for 32 bits.
// TODO: how is that with other architectures?

// Note that, with strex{b,h,,d}, the register for the returned status value
// must be different from the register containing the value to be stored to memory.
// This has been enforced by always using [flr]"=&r"(failure) or [flr]"+&r"(failure)
// as an output resp. input/output operand, i.e., with the '&' earlyclobber constraint modifier,
// even though this is typically neither input nor output except in the one case where it is both.

// TODO: what does this do for non-shared memory, where no monitor seems to be used,
// and what is this separation of local and global monitors?

// Dealing with interrupts:
// TODO: validate, investigate further
// Note that the {ldrex,strex}{b,h,,d} monitor is not cleared by interrupts,
// such as presumably for signals or preemptive context switches,
// so as a workaround the handler and/or O.S. should clear it
// before continuing execution of an interrupted program
// (simple interruptions may not need to, but it's better to err on the side of safety).

// __TBB_CompareStore, __TBB_CompareAndSwap

    // just for information (any delegation is inverted)
    #define __TBB_MACHINE_COMPARE_STORE_NO_SPURIOUS_FAILURE 0

    // including {1,2}-byte versions

    #define __TBB_M(S,T,X)                                                                         \
    template<tbb::memory_semantics M> struct __TBB_CompareStore<S,T,M> {                           \
        static inline bool op (volatile void *ptr, T& comparand, T intended) {                     \
            __TBB_fence_guard<true,__TBB_MACHINE_TRANSACTION_IS_ORDERED,true,M> anonymous;         \
            int32_t failure = 1 /*for failure from initial comparison*/;                           \
            T observed; tbb::internal::atomic_word<S>::unsignedword ucm(comparand);                \
            __asm__ __volatile__ (                                                                 \
                "ldrex" X " %[obs],[%[ptr]]\n\t"                   /* load for transaction      */ \
                "cmp %[obs],%[cmp]\n\t"                            /* compare against comparand */ \
                "bne 1f\n\t"                                       /* exit if not same          */ \
                "strex" X " %[flr],%[itd],[%[ptr]]\n\t"            /* store conditionally       */ \
                "1:"                                               /* the exit                  */ \
                :     "+m"(*(T*)ptr), [obs]"=&r"(observed), [flr]"+&r"(failure)                    \
                : [ptr]"r"(     ptr), [itd]  "r"(intended), [cmp]  "r"(ucm    )                    \
                : "cc" );                                                                          \
            if( !failure ) comparand = observed;                                                   \
            return !failure;                                                                       \
        }                                                                                          \
    };                                                                                             \
    template<tbb::memory_semantics M> struct __TBB_CompareAndSwap<S,T,M> {                         \
        static inline T op( volatile void *ptr, T intended, T comparand ) {                        \
            __TBB_fence_guard<true,__TBB_MACHINE_TRANSACTION_IS_ORDERED,true,M> anonymous;         \
            int32_t failure; T observed; tbb::internal::atomic_word<S>::unsignedword ucm(comparand); \
            __asm__ __volatile__ (                                                                 \
                "0: ldrex" X " %[obs],[%[ptr]]\n\t"                /* load for transaction      */ \
                "cmp %[obs],%[cmp]\n\t"                            /* compare against comparand */ \
                "bne 1f\n\t"                                       /* exit if not same          */ \
                "strex" X " %[flr],%[itd],[%[ptr]]\n\t"            /* store conditionally       */ \
                "teq %[flr],#0\n\tbne 0b\n\t"                      /* retry if unsuccessful     */ \
                "1:"                                               /* the exit                  */ \
                :     "+m"(*(T*)ptr), [obs]"=&r"(observed), [flr]"=&r"(failure)                    \
                : [ptr]"r"(     ptr), [itd]  "r"(intended), [cmp]  "r"(ucm    )                    \
                : "cc" );                                                                          \
            return observed;                                                                       \
        }                                                                                          \
    };

    __TBB_M(1,int8_t ,"b")
    __TBB_M(2,int16_t,"h")
    __TBB_M(4,int32_t,"" )

    #undef __TBB_M

    // TODO: direct implementation, adapt __TBB_MACHINE_COMPARE_STORE_NO_SPURIOUS_FAILURE comment
    #define __TBB_MACHINE_INVERTED_COMPARE_STORE_SWAP_DELEGATION                                   \
        __TBB_M(8,int64_t)

    // Keep this text synchronized between ARM and ESA/390-z/Architecture:
    //     This way of passing variables between an array in memory and pairs of registers
    //     is being used for lack of dedicated constraints like "A" on x86;
    //     hopefully the clobber syntax works (not tested yet at this time)?
    //     Note that C++ probably does not allow taking the address of the parameters,
    //     and it is not clear how much it would benefit performance to even try that.
    template<tbb::memory_semantics M> struct __TBB_CompareAndSwap<8,int64_t,M> {
        static inline int64_t op( volatile void *ptr, int64_t value, int64_t comparand ) {
            __TBB_fence_guard<true,false,true,M> anonymous;
            int32_t failure; int64_t registers[2] = { comparand, value };
            __asm__ __volatile__ (
                "ldm %[rgs],{r10,r11,r12,r13}\n\t"
                "0: ldrexd r8,[%[ptr]]\n\t"                        /* load for transaction      */
                "cmp r8,r10\n\t"                                   /* compare against comparand */
                "bne 1f\n\t"                                       /* exit if not same          */
                "cmp r9,r11\n\t"                                   /* compare against comparand */
                "bne 1f\n\t"                                       /* exit if not same          */
                "strexd %[flr],r12,[%[ptr]]\n\t"                   /* store conditionally       */
                "teq %[flr],#0\n\tbne 0b\n\t"                      /* retry if unsuccessful     */
                "1: stm %[rgs],{r8,9}"                             /* the exit                  */
                :     "+m"(*(int64_t*)ptr), "+m"(registers[0]), [flr]"=&r"(failure  )
                : [ptr]"r"(           ptr),  "m"(registers[1]), [rgs]  "r"(registers)
                : "r8", "r9", "r10", "r11", "r12", "r13", "cc" );  // TODO: are these registers available for use?
            return registers[0];
        }
    };

// __TBB_FetchOp

    // no overflow issues like Alpha and MIPS (requires a specific instruction, and then explicit detection)

    #define __TBB_MM(S,T,X,__TBB_op_op,mnemonic)                                                   \
    template<tbb::memory_semantics M> struct __TBB_FetchOp<S,T,M,__TBB_op_op> {                    \
        static inline T op (volatile void *ptr, T value) {                                         \
            __TBB_fence_guard<true,__TBB_MACHINE_TRANSACTION_IS_ORDERED,true,M> anonymous;         \
            int32_t failure; T intended, result;                                                   \
            __asm__ __volatile__ (                                                                 \
                "0: ldrex" X " %[res],[%[ptr]]\n\t"                /* load for transaction      */ \
                mnemonic " %[itd],%[res],%[val]\n\t"               /* perform operation         */ \
                "strex" X " %[flr],%[itd],[%[ptr]]\n\t"            /* store conditionally       */ \
                "teq %[flr],#0\n\tbne 0b\n\t"                      /* retry if unsuccessful     */ \
                :     "+m"(*(T*)ptr), [res]"=&r"(result), [flr]"=&r"(failure )                     \
                : [ptr]"r"(     ptr), [val]  "r"(value ), [itd]  "r"(intended)                     \
                : "cc" );                                                                          \
            return result;                                                                         \
        }                                                                                          \
    };

    // Note that "swap" instructions are available for sizes 8/16/32 but deprecated against exclusive load and store.

    #define __TBB_M(S,T,X)                                                                         \
    template<tbb::memory_semantics M> struct __TBB_FetchOp<S,T,M,__TBB_op_store> {                 \
        static inline T op (volatile void *ptr, T value) {                                         \
            __TBB_fence_guard<true,__TBB_MACHINE_TRANSACTION_IS_ORDERED,true,M> anonymous;         \
            int32_t failure; T result;                                                             \
            __asm__ __volatile__ (                                                                 \
                "0: ldrex" X " %[res],[%[ptr]]\n\t"                /* load for transaction      */ \
                "strex" X " %[flr],%[val],[%[ptr]]\n\t"            /* store conditionally       */ \
                "teq %[flr],#0\n\tbne 0b\n\t"                      /* retry if unsuccessful     */ \
                :     "+m"(*(T*)ptr), [res]"=&r"(result), [flr]"=&r"(failure)                      \
                : [ptr]"r"(     ptr), [val]  "r"(value )                                           \
                : "cc" );                                                                          \
            return result;                                                                         \
        }                                                                                          \
    };                                                                                             \
    __TBB_MM(S,T,X,__TBB_op_add,"add"  )                                                           \
    __TBB_MM(S,T,X,__TBB_op_and,"and"  )                                                           \
    __TBB_MM(S,T,X,__TBB_op_or ,"orr"  )                                                           \
    __TBB_MM(S,T,X,__TBB_op_xor,"eor"  )

    __TBB_M(1,int8_t ,"b")
    __TBB_M(2,int16_t,"h")
    __TBB_M(4,int32_t,"" )
    // using default delegation for 64 bits (would specific support be worthwhile?)

    #undef __TBB_M

    #undef __TBB_MM

// __TBB_Op(__TBB_store), __TBB_Load

    // TODO: documentation to validate that these are all (including 1-byte and 2-byte) "single-copy atomic"

    #define __TBB_M(S,T,store,load)                                                                \
    template<tbb::memory_semantics M> struct __TBB_Op<S,T,M,__TBB_op_store> {                      \
        static inline void op (volatile void *ptr, T value) {                                      \
            __TBB_fence_guard<true,false,false,M> anonymous;                                       \
            __asm__ __volatile__ (                                                                 \
                store " %[val],[%[ptr]]"                                                           \
                :     "=m"(*(T*)ptr)                                                               \
                : [ptr]"r"(     ptr), [val]"r"(value)                                              \
                : );                                                                               \
        }                                                                                          \
    };                                                                                             \
    template<tbb::memory_semantics M> struct __TBB_Load<S,T,M> {                                   \
        static inline T op (const volatile void *ptr) {                                            \
            __TBB_fence_guard<false,false,true,M> anonymous;                                       \
            T result;                                                                              \
            __asm__ __volatile__ (                                                                 \
                load " %[res],[%[ptr]]"                                                            \
                : [res]"=r"(result)                                                                \
                : [ptr]"r"(ptr), "m"(*(T*)ptr)                                                     \
                : );                                                                               \
            return result;                                                                         \
        }                                                                                          \
    };

    __TBB_M(1,int8_t ,"strb","ldrsb")
    __TBB_M(2,int16_t,"strh","ldrsh")
    __TBB_M(4,int32_t,"str" ,"ldr"  )

    // using default delegation for 64 bits because
    //     "The time order of the accesses to the two memory words is not architecturally defined."

    #undef __TBB_M

// __TBB_Log2

    // This requires that register size will not increase beyond 32 bytes,
    // trading flexibility for performance.
    // TODO: verify at application start-up or buy the flexibility anyway?
    static inline uintptr_t __TBB_machine_lg_cntlz( uintptr_t x ) {
        __TBB_STATIC_ASSERT2(sizeof(uintptr_t)<=__TBB_REGISTERSIZE);
        __asm__ __volatile__ ("clz %[x],%[x]" : [x]"+r"(x) : : );
        return x;
    }
    #define __TBB_Log2(V) ((8*__TBB_REGISTERSIZE-1)-__TBB_machine_lg_cntlz(V))

// no __TBB_Pause

// no __TBB_*Byte