/* Copyright (C) 2016-2022 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This file is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* {{{ Includes. */
/* We want GET_MODE_SIZE et al to return integers, please. */
#define IN_TARGET_CODE 1
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "memmodel.h"
#include "rtl.h"
#include "tree.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic-core.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "calls.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "cfgrtl.h"
#include "langhooks.h"
#include "builtins.h"
#include "omp-general.h"
#include "print-rtl.h"
#include "attribs.h"
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"
#include "dwarf2.h"
#include "gimple.h"
/* This file should be included last. */
#include "target-def.h"
/* }}} */
/* {{{ Global variables. */
/* Constants used by FP instructions. */
static REAL_VALUE_TYPE dconst4, dconst1over2pi;
static bool ext_gcn_constants_init = 0;
/* Holds the ISA variant, derived from the command line parameters. */
int gcn_isa = 3; /* Default to GCN3. */
/* Reserve this much space for LDS (for propagating variables from
worker-single mode to worker-partitioned mode), per workgroup. Global
analysis could calculate an exact bound, but we don't do that yet.
We want to permit full occupancy, so size accordingly. */
/* Use this as a default, but allow it to grow if the user requests a large
amount of gang-private shared-memory space. */
static int acc_lds_size = 0x600;
#define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */
#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */
#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
: flag_openmp ? OMP_LDS_SIZE \
: OTHER_LDS_SIZE)
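/* Minimum gang-private LDS allocation; gcn_option_override raises smaller
gang_private_size_opt requests to this value. */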
static int gang_private_hwm = 32;
static hash_map<tree, int> lds_allocs;
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT 24
/* }}} */
/* {{{ Initialization and options. */
/* Initialize machine_function. */
static struct machine_function *
gcn_init_machine_status (void)
{
struct machine_function *f;
f = ggc_cleared_alloc<machine_function> ();
if (TARGET_GCN3)
f->use_flat_addressing = true;
return f;
}
/* Implement TARGET_OPTION_OVERRIDE.
Override option settings where defaults are variable, or we have specific
needs to consider. */
static void
gcn_option_override (void)
{
init_machine_status = gcn_init_machine_status;
/* The HSA runtime does not respect ELF load addresses, so force PIE. */
if (!flag_pie)
flag_pie = 2;
if (!flag_pic)
flag_pic = flag_pie;
gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
/* The default stack size needs to be small for offload kernels because
there may be many, many threads. Also, a smaller stack gives a
measurable performance boost. But, a small stack is insufficient
for running the testsuite, so we use a larger default for the
stand-alone case. */
if (stack_size_opt == -1)
{
if (flag_openacc || flag_openmp)
/* 512 bytes per work item = 32kB total. */
stack_size_opt = 512 * 64;
else
/* 1MB total. */
stack_size_opt = 1048576;
}
/* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
worker broadcasts. */
if (gang_private_size_opt == -1)
gang_private_size_opt = 512;
else if (gang_private_size_opt < gang_private_hwm)
gang_private_size_opt = gang_private_hwm;
else if (gang_private_size_opt >= acc_lds_size - 1024)
{
/* We need some space for reductions and worker broadcasting. If the
user requests a large amount of gang-private LDS space, we might not
have enough left for the former. Increase the LDS allocation in that
case, although this may reduce the maximum occupancy on the
hardware. */
acc_lds_size = gang_private_size_opt + 1024;
if (acc_lds_size > 32768)
acc_lds_size = 32768;
}
/* The xnack option is a placeholder, for now. */
if (flag_xnack)
sorry ("XNACK support");
}
/* }}} */
/* {{{ Attributes. */
/* This table defines the arguments that are permitted in
__attribute__ ((amdgpu_hsa_kernel (...))).
The names and values correspond to the HSA metadata that is encoded
into the assembler file and binary. */
static const struct gcn_kernel_arg_type
{
const char *name;
const char *header_pseudo;
machine_mode mode;
/* This should be set to -1 or -2 for a dynamically allocated register
number. Use -1 if this argument contributes to the user_sgpr_count,
-2 otherwise. */
int fixed_regno;
} gcn_kernel_arg_types[] = {
{"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
{"private_segment_buffer",
".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
{"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
{"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
{"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
{"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
{"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
{"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
{"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
{"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
{"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
{"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
{"private_segment_wave_offset",
".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
{"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 14
{"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 15
{"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};
static const long default_requested_args
= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
| (1 << DISPATCH_PTR_ARG)
| (1 << QUEUE_PTR_ARG)
| (1 << KERNARG_SEGMENT_PTR_ARG)
| (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
| (1 << WORKGROUP_ID_X_ARG)
| (1 << WORK_ITEM_ID_X_ARG)
| (1 << WORK_ITEM_ID_Y_ARG)
| (1 << WORK_ITEM_ID_Z_ARG);
/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
This function also sets the default values for some arguments.
Return true if a parse error was reported, otherwise false with ARGS
populated. */
static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
tree list)
{
bool err = false;
args->requested = default_requested_args;
args->nargs = 0;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
args->reg[a] = -1;
for (; list; list = TREE_CHAIN (list))
{
const char *str;
if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
{
error ("% attribute requires string constant "
"arguments");
break;
}
str = TREE_STRING_POINTER (TREE_VALUE (list));
int a;
for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
if (!strcmp (str, gcn_kernel_arg_types[a].name))
break;
}
if (a == GCN_KERNEL_ARG_TYPES)
{
error ("unknown specifier %qs in % attribute",
str);
err = true;
break;
}
if (args->requested & (1 << a))
{
error ("duplicated parameter specifier %qs in % "
"attribute", str);
err = true;
break;
}
args->requested |= (1 << a);
args->order[args->nargs++] = a;
}
/* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
requesting WORK_ITEM_ID_X_ARG. */
if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
args->requested |= (1 << WORK_ITEM_ID_X_ARG);
int sgpr_regno = FIRST_SGPR_REG;
args->nsgprs = 0;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
if (!(args->requested & (1 << a)))
continue;
if (gcn_kernel_arg_types[a].fixed_regno >= 0)
args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
else
{
int reg_count;
switch (gcn_kernel_arg_types[a].mode)
{
case E_SImode:
reg_count = 1;
break;
case E_DImode:
reg_count = 2;
break;
case E_TImode:
reg_count = 4;
break;
default:
gcc_unreachable ();
}
args->reg[a] = sgpr_regno;
sgpr_regno += reg_count;
if (gcn_kernel_arg_types[a].fixed_regno == -1)
args->nsgprs += reg_count;
}
}
if (sgpr_regno > FIRST_SGPR_REG + 16)
{
error ("too many arguments passed in sgpr registers");
}
return err;
}
/* Referenced by TARGET_ATTRIBUTE_TABLE.
Validates target specific attributes. */
static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
tree args, int, bool *no_add_attrs)
{
if (!FUNC_OR_METHOD_TYPE_P (*node))
{
warning (OPT_Wattributes, "%qE attribute only applies to functions",
name);
*no_add_attrs = true;
return NULL_TREE;
}
/* Can combine regparm with all attributes but fastcall, and thiscall. */
if (is_attribute_p ("gcnhsa_kernel", name))
{
struct gcn_kernel_args kernelarg;
if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
*no_add_attrs = true;
return NULL_TREE;
}
return NULL_TREE;
}
/* Implement TARGET_ATTRIBUTE_TABLE.
Create target-specific __attribute__ types. */
static const struct attribute_spec gcn_attribute_table[] = {
/* { name, min_len, max_len, decl_req, type_req, fn_type_req,
affects_type_identity, handler, exclude } */
{"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
/* End element. */
{NULL, 0, 0, false, false, false, false, NULL, NULL}
};
/* }}} */
/* {{{ Registers and modes. */
/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */
bool
gcn_scalar_mode_supported_p (scalar_mode mode)
{
return (mode == BImode
|| mode == QImode
|| mode == HImode /* || mode == HFmode */
|| mode == SImode || mode == SFmode
|| mode == DImode || mode == DFmode
|| mode == TImode);
}
/* Implement TARGET_CLASS_MAX_NREGS.
Return the number of hard registers needed to hold a value of MODE in
a register of class RCLASS. */
static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
/* Scalar registers are 32-bit; vector registers are in fact tuples of
64 lanes. */
if (rclass == VGPR_REGS)
{
if (vgpr_1reg_mode_p (mode))
return 1;
if (vgpr_2reg_mode_p (mode))
return 2;
/* TImode is used by DImode compare_and_swap. */
if (mode == TImode)
return 4;
}
else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
return 2;
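/* Everything else uses one 32-bit register for each 4 bytes. */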
return CEIL (GET_MODE_SIZE (mode), 4);
}
/* Implement TARGET_HARD_REGNO_NREGS.
Return the number of hard registers needed to hold a value of MODE in
REGNO. */
unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
{
return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
}
/* Implement TARGET_HARD_REGNO_MODE_OK.
Return true if REGNO can hold value in MODE. */
bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
/* Treat a complex mode as if it were a scalar mode of the same overall
size for the purposes of allocating hard registers. */
if (COMPLEX_MODE_P (mode))
switch (mode)
{
case E_CQImode:
case E_CHImode:
mode = SImode;
break;
case E_CSImode:
mode = DImode;
break;
case E_CDImode:
mode = TImode;
break;
case E_HCmode:
mode = SFmode;
break;
case E_SCmode:
mode = DFmode;
break;
default:
/* Not supported. */
return false;
}
switch (regno)
{
case FLAT_SCRATCH_LO_REG:
case XNACK_MASK_LO_REG:
case TBA_LO_REG:
case TMA_LO_REG:
return (mode == SImode || mode == DImode);
case VCC_LO_REG:
case EXEC_LO_REG:
return (mode == BImode || mode == SImode || mode == DImode);
case M0_REG:
case FLAT_SCRATCH_HI_REG:
case XNACK_MASK_HI_REG:
case TBA_HI_REG:
case TMA_HI_REG:
return mode == SImode;
case VCC_HI_REG:
return false;
case EXEC_HI_REG:
return mode == SImode /*|| mode == V32BImode */ ;
case SCC_REG:
case VCCZ_REG:
case EXECZ_REG:
return mode == BImode;
}
if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
return true;
if (SGPR_REGNO_P (regno))
/* We restrict double register values to aligned registers. */
return (sgpr_1reg_mode_p (mode)
|| (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
|| (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
if (VGPR_REGNO_P (regno))
/* Vector instructions do not care about the alignment of register
pairs, but where there is no 64-bit instruction, many of the
define_splits do not work if the input and output registers partially
overlap. We tried to fix this with early clobber and match
constraints, but it was bug prone, added complexity, and conflicts
with the 'U0' constraints on vec_merge.
Therefore, we restrict ourselves to aligned registers. */
return (vgpr_1reg_mode_p (mode)
|| (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
/* TImode is used by DImode compare_and_swap. */
|| (mode == TImode
&& !((regno - FIRST_VGPR_REG) & 3)));
return false;
}
/* Implement REGNO_REG_CLASS via gcn.h.
Return smallest class containing REGNO. */
enum reg_class
gcn_regno_reg_class (int regno)
{
switch (regno)
{
case SCC_REG:
return SCC_CONDITIONAL_REG;
case VCC_LO_REG:
case VCC_HI_REG:
return VCC_CONDITIONAL_REG;
case VCCZ_REG:
return VCCZ_CONDITIONAL_REG;
case EXECZ_REG:
return EXECZ_CONDITIONAL_REG;
case EXEC_LO_REG:
case EXEC_HI_REG:
return EXEC_MASK_REG;
}
if (VGPR_REGNO_P (regno))
return VGPR_REGS;
if (SGPR_REGNO_P (regno))
return SGPR_REGS;
if (regno < FIRST_VGPR_REG)
return GENERAL_REGS;
if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
return AFP_REGS;
return ALL_REGS;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
GCC assumes that the lowpart contains the first part of the value as stored
in memory.
This is not the case for vector registers. */
bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
reg_class_t regclass)
{
if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
return true;
return (gcn_class_max_nregs (regclass, from)
== gcn_class_max_nregs (regclass, to));
}
/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
When this hook returns true for MODE, the compiler allows
registers explicitly used in the rtl to be used as spill registers
but prevents the compiler from extending the lifetime of these
registers. */
bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
{
/* We allocate into exec and vcc regs, which form small register classes. */
return mode == DImode || mode == SImode;
}
/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
Returns true if pseudos that have been assigned to registers of class RCLASS
would likely be spilled because registers of RCLASS are needed for spill
registers. */
static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
{
return (rclass == EXEC_MASK_REG
|| reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
}
/* Implement TARGET_MODES_TIEABLE_P.
Returns true if a value of MODE1 is accessible in MODE2 without
copying. */
bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
&& GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
}
/* Implement TARGET_TRULY_NOOP_TRUNCATION.
Returns true if it is safe to “convert” a value of INPREC bits to one of
OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
it as if it had only OUTPREC bits. */
bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
{
return ((inprec <= 32) && (outprec <= inprec));
}
/* Return N-th part of value occupying multiple registers. */
rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
{
if (GET_MODE_SIZE (mode) >= 256)
{
/*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
if (REG_P (op))
{
gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
return gen_rtx_REG (V64SImode, REGNO (op) + n);
}
if (GET_CODE (op) == CONST_VECTOR)
{
int units = GET_MODE_NUNITS (mode);
rtvec v = rtvec_alloc (units);
for (int i = 0; i < units; ++i)
RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
CONST_VECTOR_ELT (op, i), n);
return gen_rtx_CONST_VECTOR (V64SImode, v);
}
if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
return gcn_gen_undef (V64SImode);
gcc_unreachable ();
}
else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
{
gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
return gen_rtx_REG (SImode, REGNO (op) + n);
}
else
{
if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
return gcn_gen_undef (SImode);
/* If it's a constant then let's assume it is of the largest mode
available, otherwise simplify_gen_subreg will fail. */
if (mode == VOIDmode && CONST_INT_P (op))
mode = DImode;
return simplify_gen_subreg (SImode, op, mode, n * 4);
}
}
/* Return the N-th 64-bit (double-register) part of a value occupying
multiple registers. */
rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
{
return simplify_gen_subreg (DImode, op, mode, n * 8);
}
/* Return true if OP can be split into subregs or high/low parts.
This is always true for scalars, but not normally true for vectors.
However, for vectors in hardregs we can use the low and high registers. */
bool
gcn_can_split_p (machine_mode, rtx op)
{
if (vgpr_vector_mode_p (GET_MODE (op)))
{
if (GET_CODE (op) == SUBREG)
op = SUBREG_REG (op);
if (!REG_P (op))
return true;
return REGNO (op) <= FIRST_PSEUDO_REGISTER;
}
return true;
}
/* Implement TARGET_SPILL_CLASS.
Return class of registers which could be used for pseudo of MODE
and of class RCLASS for spilling instead of memory. Return NO_REGS
if it is not possible or not profitable. */
static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
{
if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
|| c == VCC_CONDITIONAL_REG)
return SGPR_REGS;
else
return NO_REGS;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
Change allocno class for given pseudo from allocno and best class
calculated by IRA. */
static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
reg_class_t best_cl)
{
/* Avoid returning classes that contain both vgpr and sgpr registers. */
if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
return cl;
if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
&& best_cl != ALL_GPR_REGS)
return best_cl;
machine_mode mode = PSEUDO_REGNO_MODE (regno);
if (vgpr_vector_mode_p (mode))
return VGPR_REGS;
return GENERAL_REGS;
}
/* Create a new DImode pseudo reg and emit an instruction to initialize
it to VAL. */
static rtx
get_exec (int64_t val)
{
rtx reg = gen_reg_rtx (DImode);
emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
return reg;
}
/* Return value of scalar exec register. */
rtx
gcn_scalar_exec ()
{
return const1_rtx;
}
/* Return pseudo holding scalar exec register. */
rtx
gcn_scalar_exec_reg ()
{
return get_exec (1);
}
/* Return value of full exec register. */
rtx
gcn_full_exec ()
{
return constm1_rtx;
}
/* Return pseudo holding full exec register. */
rtx
gcn_full_exec_reg ()
{
return get_exec (-1);
}
/* }}} */
/* {{{ Immediate constants. */
/* Initialize shared numeric constants. */
static void
init_ext_gcn_constants (void)
{
real_from_integer (&dconst4, DFmode, 4, SIGNED);
/* FIXME: this constant probably does not match what hardware really loads.
Reality check it eventually. */
real_from_string (&dconst1over2pi,
"0.1591549430918953357663423455968866839");
real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
ext_gcn_constants_init = 1;
}
/* Return non-zero if X is a constant that can appear as an inline operand.
This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, or 1/(2*pi),
or a vector of those.
The value returned is the hardware encoding of this constant. */
int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
{
machine_mode mode = GET_MODE (x);
if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
&& allow_vector)
{
int n;
if (GET_CODE (x) != CONST_VECTOR)
return 0;
n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
if (!n)
return 0;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return 0;
return 1;
}
if (mode != HFmode && mode != SFmode && mode != DFmode)
return 0;
const REAL_VALUE_TYPE *r;
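/* The values returned below are the hardware's inline-constant operand
encodings: 128 is 0.0, 240/241 are +/-0.5, 242/243 are +/-1.0, 244/245
are +/-2.0, 246/247 are +/-4.0, and 248 is 1/(2*pi). */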
if (x == CONST0_RTX (mode))
return 128;
if (x == CONST1_RTX (mode))
return 242;
r = CONST_DOUBLE_REAL_VALUE (x);
if (real_identical (r, &dconstm1))
return 243;
if (real_identical (r, &dconsthalf))
return 240;
if (real_identical (r, &dconstm1))
return 243;
if (real_identical (r, &dconst2))
return 244;
if (real_identical (r, &dconst4))
return 246;
if (real_identical (r, &dconst1over2pi))
return 248;
if (!ext_gcn_constants_init)
init_ext_gcn_constants ();
real_value_negate (r);
if (real_identical (r, &dconsthalf))
return 241;
if (real_identical (r, &dconst2))
return 245;
if (real_identical (r, &dconst4))
return 247;
/* FIXME: add 4, -4 and 1/(2*PI). */
return 0;
}
/* Return true if X is a constant that can appear as an immediate operand.
This is any inline FP constant, any other value encodable as a 32-bit
immediate (i.e. not DFmode), or a vector of those. */
bool
gcn_fp_constant_p (rtx x, bool allow_vector)
{
machine_mode mode = GET_MODE (x);
if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
&& allow_vector)
{
int n;
if (GET_CODE (x) != CONST_VECTOR)
return false;
n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
if (!n)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
if (mode != HFmode && mode != SFmode && mode != DFmode)
return false;
if (gcn_inline_fp_constant_p (x, false))
return true;
/* FIXME: It is not clear how 32bit immediates are interpreted here. */
return (mode != DFmode);
}
/* Return true if X is a constant representable as an inline immediate
constant in a 32-bit instruction encoding. */
bool
gcn_inline_constant_p (rtx x)
{
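/* The integer inline constants are the values -16..64; floating-point
inline constants are checked by gcn_inline_fp_constant_p. */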
if (GET_CODE (x) == CONST_INT)
return INTVAL (x) >= -16 && INTVAL (x) <= 64;
if (GET_CODE (x) == CONST_DOUBLE)
return gcn_inline_fp_constant_p (x, false);
if (GET_CODE (x) == CONST_VECTOR)
{
int n;
if (!vgpr_vector_mode_p (GET_MODE (x)))
return false;
n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
if (!n)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return 1;
}
return false;
}
/* Return true if X is a constant representable as an immediate constant
in a 32 or 64-bit instruction encoding. */
bool
gcn_constant_p (rtx x)
{
switch (GET_CODE (x))
{
case CONST_INT:
return true;
case CONST_DOUBLE:
return gcn_fp_constant_p (x, false);
case CONST_VECTOR:
{
int n;
if (!vgpr_vector_mode_p (GET_MODE (x)))
return false;
n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
if (!n)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
case SYMBOL_REF:
case LABEL_REF:
return true;
default:
;
}
return false;
}
/* Return true if X is a constant representable as two inline immediate
constants in a 64-bit instruction that is split into two 32-bit
instructions.
When MIXED is set, the low-part is permitted to use the full 32-bits. */
bool
gcn_inline_constant64_p (rtx x, bool mixed)
{
if (GET_CODE (x) == CONST_VECTOR)
{
if (!vgpr_vector_mode_p (GET_MODE (x)))
return false;
if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
if (GET_CODE (x) != CONST_INT)
return false;
rtx val_lo = gcn_operand_part (DImode, x, 0);
rtx val_hi = gcn_operand_part (DImode, x, 1);
return ((mixed || gcn_inline_constant_p (val_lo))
&& gcn_inline_constant_p (val_hi));
}
/* Return true if X is a constant representable as an immediate constant
in a 32 or 64-bit instruction encoding where the hardware will
extend the immediate to 64-bits. */
bool
gcn_constant64_p (rtx x)
{
if (!gcn_constant_p (x))
return false;
if (GET_CODE (x) != CONST_INT)
return true;
/* Negative numbers are only allowed if they can be encoded within src0,
because the 32-bit immediates do not get sign-extended.
Unsigned numbers must not be encodable as 32-bit -1..-16, because the
assembler will use a src0 inline immediate and that will get
sign-extended. */
HOST_WIDE_INT val = INTVAL (x);
return (((val & 0xffffffff) == val /* Positive 32-bit. */
&& (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
|| gcn_inline_constant_p (x)); /* Src0. */
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P.
Returns true if X is a legitimate constant for a MODE immediate operand. */
bool
gcn_legitimate_constant_p (machine_mode, rtx x)
{
return gcn_constant_p (x);
}
/* Return true if X is a CONST_VECTOR of a single, duplicated constant. */
static bool
single_cst_vector_p (rtx x)
{
if (GET_CODE (x) != CONST_VECTOR)
return false;
for (int i = 1; i < 64; i++)
if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
return false;
return true;
}
/* Create a CONST_VECTOR of duplicated value A. */
rtx
gcn_vec_constant (machine_mode mode, int a)
{
/*if (!a)
return CONST0_RTX (mode);
if (a == -1)
return CONSTM1_RTX (mode);
if (a == 1)
return CONST1_RTX (mode);
if (a == 2)
return CONST2_RTX (mode);*/
int units = GET_MODE_NUNITS (mode);
machine_mode innermode = GET_MODE_INNER (mode);
rtx tem;
if (FLOAT_MODE_P (innermode))
{
REAL_VALUE_TYPE rv;
real_from_integer (&rv, NULL, a, SIGNED);
tem = const_double_from_real_value (rv, innermode);
}
else
tem = gen_int_mode (a, innermode);
rtvec v = rtvec_alloc (units);
for (int i = 0; i < units; ++i)
RTVEC_ELT (v, i) = tem;
return gen_rtx_CONST_VECTOR (mode, v);
}
/* Create a CONST_VECTOR of duplicated value A. */
rtx
gcn_vec_constant (machine_mode mode, rtx a)
{
int units = GET_MODE_NUNITS (mode);
rtvec v = rtvec_alloc (units);
for (int i = 0; i < units; ++i)
RTVEC_ELT (v, i) = a;
return gen_rtx_CONST_VECTOR (mode, v);
}
/* Create an undefined vector value, used where an insn operand is
optional. */
rtx
gcn_gen_undef (machine_mode mode)
{
return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
}
/* }}} */
/* {{{ Addresses, pointers and moves. */
/* Return true if REG is a valid place to store a pointer,
for instructions that require an SGPR.
FIXME rename. */
static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
{
if (GET_CODE (reg) == SUBREG)
reg = SUBREG_REG (reg);
if (!REG_P (reg))
return false;
if (GET_MODE (reg) != mode)
return false;
int regno = REGNO (reg);
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (!strict)
return true;
if (!reg_renumber)
return false;
regno = reg_renumber[regno];
}
return (SGPR_REGNO_P (regno) || regno == M0_REG
|| regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
}
/* Return true if REG is a valid place to store a pointer,
for instructions that require a VGPR. */
static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
{
if (GET_CODE (reg) == SUBREG)
reg = SUBREG_REG (reg);
if (!REG_P (reg))
return false;
if (GET_MODE (reg) != mode)
return false;
int regno = REGNO (reg);
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (!strict)
return true;
if (!reg_renumber)
return false;
regno = reg_renumber[regno];
}
return VGPR_REGNO_P (regno);
}
/* Return true if X would be valid inside a MEM using the Flat address
space. */
bool
gcn_flat_address_p (rtx x, machine_mode mode)
{
bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
if (vec_mode && gcn_address_register_p (x, DImode, false))
return true;
if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
return true;
if (TARGET_GCN5_PLUS
&& GET_CODE (x) == PLUS
&& gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
&& CONST_INT_P (XEXP (x, 1)))
return true;
return false;
}
/* Return true if X would be valid inside a MEM using the Scalar Flat
address space. */
bool
gcn_scalar_flat_address_p (rtx x)
{
if (gcn_address_register_p (x, DImode, false))
return true;
if (GET_CODE (x) == PLUS
&& gcn_address_register_p (XEXP (x, 0), DImode, false)
&& CONST_INT_P (XEXP (x, 1)))
return true;
return false;
}
/* Return true if MEM X would be valid for the Scalar Flat address space. */
bool
gcn_scalar_flat_mem_p (rtx x)
{
if (!MEM_P (x))
return false;
if (GET_MODE_SIZE (GET_MODE (x)) < 4)
return false;
return gcn_scalar_flat_address_p (XEXP (x, 0));
}
/* Return true if X would be valid inside a MEM using the LDS or GDS
address spaces. */
bool
gcn_ds_address_p (rtx x)
{
if (gcn_vec_address_register_p (x, SImode, false))
return true;
if (GET_CODE (x) == PLUS
&& gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
&& CONST_INT_P (XEXP (x, 1)))
return true;
return false;
}
/* Return true if ADDR would be valid inside a MEM using the Global
address space. */
bool
gcn_global_address_p (rtx addr)
{
if (gcn_address_register_p (addr, DImode, false)
|| gcn_vec_address_register_p (addr, DImode, false))
return true;
if (GET_CODE (addr) == PLUS)
{
rtx base = XEXP (addr, 0);
rtx offset = XEXP (addr, 1);
bool immediate_p = (CONST_INT_P (offset)
&& INTVAL (offset) >= -(1 << 12)
&& INTVAL (offset) < (1 << 12));
if ((gcn_address_register_p (base, DImode, false)
|| gcn_vec_address_register_p (base, DImode, false))
&& immediate_p)
/* SGPR + CONST or VGPR + CONST */
return true;
if (gcn_address_register_p (base, DImode, false)
&& gcn_vgpr_register_operand (offset, SImode))
/* SGPR + VGPR */
return true;
if (GET_CODE (base) == PLUS
&& gcn_address_register_p (XEXP (base, 0), DImode, false)
&& gcn_vgpr_register_operand (XEXP (base, 1), SImode)
&& immediate_p)
/* (SGPR + VGPR) + CONST */
return true;
}
return false;
}
/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
Recognizes RTL expressions that are valid memory addresses for an
instruction. The MODE argument is the machine mode for the MEM
expression that wants to use this address.
It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
convert common non-canonical forms to canonical form so that they will
be recognized. */
static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
addr_space_t as)
{
/* All vector instructions need to work on addresses in registers. */
if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
return false;
if (AS_SCALAR_FLAT_P (as))
{
if (mode == QImode || mode == HImode)
return 0;
switch (GET_CODE (x))
{
case REG:
return gcn_address_register_p (x, DImode, strict);
/* Addresses are in the form BASE+OFFSET
OFFSET is either 20bit unsigned immediate, SGPR or M0.
Writes and atomics do not accept SGPR. */
case PLUS:
{
rtx x0 = XEXP (x, 0);
rtx x1 = XEXP (x, 1);
if (!gcn_address_register_p (x0, DImode, strict))
return false;
/* FIXME: This is disabled because of the mode mismatch between
SImode (for the address or m0 register) and the DImode PLUS.
We'll need a zero_extend or similar.
if (gcn_m0_register_p (x1, SImode, strict)
|| gcn_address_register_p (x1, SImode, strict))
return true;
else*/
if (GET_CODE (x1) == CONST_INT)
{
if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
/* The low bits of the offset are ignored, even when
they're meant to realign the pointer. */
&& !(INTVAL (x1) & 0x3))
return true;
}
return false;
}
default:
break;
}
}
else if (AS_SCRATCH_P (as))
return gcn_address_register_p (x, SImode, strict);
else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
{
if (TARGET_GCN3 || GET_CODE (x) == REG)
return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
? gcn_address_register_p (x, DImode, strict)
: gcn_vec_address_register_p (x, DImode, strict));
else
{
gcc_assert (TARGET_GCN5_PLUS);
if (GET_CODE (x) == PLUS)
{
rtx x1 = XEXP (x, 1);
if (VECTOR_MODE_P (mode)
? !gcn_address_register_p (x, DImode, strict)
: !gcn_vec_address_register_p (x, DImode, strict))
return false;
if (GET_CODE (x1) == CONST_INT)
{
if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
/* The low bits of the offset are ignored, even when
they're meant to realign the pointer. */
&& !(INTVAL (x1) & 0x3))
return true;
}
}
return false;
}
}
else if (AS_GLOBAL_P (as))
{
gcc_assert (TARGET_GCN5_PLUS);
if (GET_CODE (x) == REG)
return (gcn_address_register_p (x, DImode, strict)
|| (!VECTOR_MODE_P (mode)
&& gcn_vec_address_register_p (x, DImode, strict)));
else if (GET_CODE (x) == PLUS)
{
rtx base = XEXP (x, 0);
rtx offset = XEXP (x, 1);
bool immediate_p = (GET_CODE (offset) == CONST_INT
/* Signed 13-bit immediate. */
&& INTVAL (offset) >= -(1 << 12)
&& INTVAL (offset) < (1 << 12)
/* The low bits of the offset are ignored, even
when they're meant to realign the pointer. */
&& !(INTVAL (offset) & 0x3));
if (!VECTOR_MODE_P (mode))
{
if ((gcn_address_register_p (base, DImode, strict)
|| gcn_vec_address_register_p (base, DImode, strict))
&& immediate_p)
/* SGPR + CONST or VGPR + CONST */
return true;
if (gcn_address_register_p (base, DImode, strict)
&& gcn_vgpr_register_operand (offset, SImode))
/* SGPR + VGPR */
return true;
if (GET_CODE (base) == PLUS
&& gcn_address_register_p (XEXP (base, 0), DImode, strict)
&& gcn_vgpr_register_operand (XEXP (base, 1), SImode)
&& immediate_p)
/* (SGPR + VGPR) + CONST */
return true;
}
else
{
if (gcn_address_register_p (base, DImode, strict)
&& immediate_p)
/* SGPR + CONST */
return true;
}
}
else
return false;
}
else if (AS_ANY_DS_P (as))
switch (GET_CODE (x))
{
case REG:
return (VECTOR_MODE_P (mode)
? gcn_address_register_p (x, SImode, strict)
: gcn_vec_address_register_p (x, SImode, strict));
/* Addresses are in the form BASE+OFFSET
OFFSET is either 20bit unsigned immediate, SGPR or M0.
Writes and atomics do not accept SGPR. */
case PLUS:
{
rtx x0 = XEXP (x, 0);
rtx x1 = XEXP (x, 1);
if (!gcn_vec_address_register_p (x0, DImode, strict))
return false;
if (GET_CODE (x1) == REG)
{
if (GET_CODE (x1) != REG
|| (REGNO (x1) <= FIRST_PSEUDO_REGISTER
&& !gcn_ssrc_register_operand (x1, DImode)))
return false;
}
else if (GET_CODE (x1) == CONST_VECTOR
&& GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
&& single_cst_vector_p (x1))
{
x1 = CONST_VECTOR_ELT (x1, 0);
if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
return true;
}
return false;
}
default:
break;
}
else
gcc_unreachable ();
return false;
}
/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
Return the appropriate mode for a named address pointer. */
static scalar_int_mode
gcn_addr_space_pointer_mode (addr_space_t addrspace)
{
switch (addrspace)
{
case ADDR_SPACE_SCRATCH:
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
return SImode;
case ADDR_SPACE_DEFAULT:
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_SCALAR_FLAT:
return DImode;
default:
gcc_unreachable ();
}
}
/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
Return the appropriate mode for a named address space address. */
static scalar_int_mode
gcn_addr_space_address_mode (addr_space_t addrspace)
{
return gcn_addr_space_pointer_mode (addrspace);
}
/* Implement TARGET_ADDR_SPACE_SUBSET_P.
Determine if one named address space is a subset of another. */
static bool
gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
{
if (subset == superset)
return true;
/* FIXME is this true? */
if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
return true;
return false;
}
/* Convert from one address space to another. */
static rtx
gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
{
gcc_assert (POINTER_TYPE_P (from_type));
gcc_assert (POINTER_TYPE_P (to_type));
addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
{
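/* Form the flat address by placing the 32-bit LDS offset in the low half
and the group-segment aperture, read from offset 64 of the HSA queue
object, in the high half. */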
rtx queue = gen_rtx_REG (DImode,
cfun->machine->args.reg[QUEUE_PTR_ARG]);
rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
gen_rtx_PLUS (DImode, queue,
gen_int_mode (64, SImode)));
rtx tmp = gen_reg_rtx (DImode);
emit_move_insn (gen_lowpart (SImode, tmp), op);
emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
group_seg_aperture_hi);
return tmp;
}
else if (as_from == as_to)
return op;
else
gcc_unreachable ();
}
/* Implement TARGET_ADDR_SPACE_DEBUG.
Return the dwarf address space class for each hardware address space. */
static int
gcn_addr_space_debug (addr_space_t as)
{
switch (as)
{
case ADDR_SPACE_DEFAULT:
case ADDR_SPACE_FLAT:
case ADDR_SPACE_SCALAR_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
return DW_ADDR_none;
case ADDR_SPACE_GLOBAL:
return 1; // DW_ADDR_LLVM_global
case ADDR_SPACE_LDS:
return 3; // DW_ADDR_LLVM_group
case ADDR_SPACE_SCRATCH:
return 4; // DW_ADDR_LLVM_private
case ADDR_SPACE_GDS:
return 0x8000; // DW_ADDR_AMDGPU_region
}
gcc_unreachable ();
}
/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
Return true if REGNO is OK for memory addressing. */
bool
gcn_regno_mode_code_ok_for_base_p (int regno,
machine_mode, addr_space_t as, int, int)
{
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (reg_renumber)
regno = reg_renumber[regno];
else
return true;
}
if (AS_FLAT_P (as))
return (VGPR_REGNO_P (regno)
|| regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
else if (AS_SCALAR_FLAT_P (as))
return (SGPR_REGNO_P (regno)
|| regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
else if (AS_GLOBAL_P (as))
{
return (SGPR_REGNO_P (regno)
|| VGPR_REGNO_P (regno)
|| regno == ARG_POINTER_REGNUM
|| regno == FRAME_POINTER_REGNUM);
}
else
/* For now. */
return false;
}
/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
Return a suitable register class for memory addressing. */
reg_class
gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
int ic)
{
switch (as)
{
case ADDR_SPACE_DEFAULT:
return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
case ADDR_SPACE_SCALAR_FLAT:
case ADDR_SPACE_SCRATCH:
return SGPR_REGS;
break;
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
? SGPR_REGS : VGPR_REGS);
case ADDR_SPACE_GLOBAL:
return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
? SGPR_REGS : ALL_GPR_REGS);
}
gcc_unreachable ();
}
/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
Return true if REGNO is OK for index of memory addressing. */
bool
regno_ok_for_index_p (int regno)
{
if (regno >= FIRST_PSEUDO_REGISTER)
{
if (reg_renumber)
regno = reg_renumber[regno];
else
return true;
}
return regno == M0_REG || VGPR_REGNO_P (regno);
}
/* Generate move which uses the exec flags. If EXEC is NULL, then it is
assumed that all lanes normally relevant to the mode of the move are
affected. If PREV is NULL, then a sensible default is supplied for
the inactive lanes. */
static rtx
gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
{
machine_mode mode = GET_MODE (op0);
if (vgpr_vector_mode_p (mode))
{
if (exec && exec != CONSTM1_RTX (DImode))
{
if (!prev)
prev = op0;
}
else
{
if (!prev)
prev = gcn_gen_undef (mode);
exec = gcn_full_exec_reg ();
}
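/* Vector moves are emitted as a PARALLEL of the vec_merge set and a
scratch V64DI clobber, matching the masked move patterns in the machine
description. */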
rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
return gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (2, set,
gen_rtx_CLOBBER (VOIDmode,
gen_rtx_SCRATCH (V64DImode))));
}
return (gen_rtx_PARALLEL
(VOIDmode,
gen_rtvec (2, gen_rtx_SET (op0, op1),
gen_rtx_USE (VOIDmode,
exec ? exec : gcn_scalar_exec ()))));
}
/* Generate a vec_duplicate of OP1 into OP0, masked by EXEC if given, with
OP2 supplying the values of the inactive lanes. */
static rtx
gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
{
if (exec)
return (gen_rtx_SET (op0,
gen_rtx_VEC_MERGE (GET_MODE (op0),
gen_rtx_VEC_DUPLICATE (GET_MODE
(op0), op1),
op2, exec)));
else
return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
}
/* Expand vector init of OP0 by VEC.
Implements vec_init instruction pattern. */
void
gcn_expand_vector_init (rtx op0, rtx vec)
{
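/* Handle lanes that share a value together: gather them into a mask and
emit one (possibly exec-masked) move or broadcast per distinct value. */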
int64_t initialized_mask = 0;
int64_t curr_mask = 1;
machine_mode mode = GET_MODE (op0);
rtx val = XVECEXP (vec, 0, 0);
for (int i = 1; i < 64; i++)
if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
curr_mask |= (int64_t) 1 << i;
if (gcn_constant_p (val))
emit_move_insn (op0, gcn_vec_constant (mode, val));
else
{
val = force_reg (GET_MODE_INNER (mode), val);
emit_insn (gen_duplicate_load (op0, val));
}
initialized_mask |= curr_mask;
for (int i = 1; i < 64; i++)
if (!(initialized_mask & ((int64_t) 1 << i)))
{
curr_mask = (int64_t) 1 << i;
rtx val = XVECEXP (vec, 0, i);
for (int j = i + 1; j < 64; j++)
if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
curr_mask |= (int64_t) 1 << j;
if (gcn_constant_p (val))
emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
get_exec (curr_mask)));
else
{
val = force_reg (GET_MODE_INNER (mode), val);
emit_insn (gen_duplicate_load (op0, val, op0,
get_exec (curr_mask)));
}
initialized_mask |= curr_mask;
}
}
/* Load vector constant where n-th lane contains BASE+n*VAL. */
static rtx
strided_constant (machine_mode mode, int base, int val)
{
rtx x = gen_reg_rtx (mode);
emit_move_insn (x, gcn_vec_constant (mode, base));
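/* Build the stride in six steps: each step adds VAL * 2^k to the lanes
whose index has bit k set, so lane n ends up holding BASE + n*VAL. */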
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
x, get_exec (0xffffffff00000000)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
x, get_exec (0xffff0000ffff0000)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
x, get_exec (0xff00ff00ff00ff00)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
x, get_exec (0xf0f0f0f0f0f0f0f0)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
x, get_exec (0xcccccccccccccccc)));
emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
x, get_exec (0xaaaaaaaaaaaaaaaa)));
return x;
}
/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
static rtx
gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
addr_space_t as)
{
switch (as)
{
case ADDR_SPACE_DEFAULT:
return gcn_addr_space_legitimize_address (x, old, mode,
DEFAULT_ADDR_SPACE);
case ADDR_SPACE_SCALAR_FLAT:
case ADDR_SPACE_SCRATCH:
/* Instructions working on vectors need the address to be in
a register. */
if (vgpr_vector_mode_p (mode))
return force_reg (GET_MODE (x), x);
return x;
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_GLOBAL:
return TARGET_GCN3 ? force_reg (DImode, x) : x;
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
/* FIXME: LDS supports offsets; handle them! */
if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
{
rtx addrs = gen_reg_rtx (V64SImode);
rtx base = force_reg (SImode, x);
rtx offsets = strided_constant (V64SImode, 0,
GET_MODE_UNIT_SIZE (mode));
emit_insn (gen_vec_duplicatev64si (addrs, base));
emit_insn (gen_addv64si3 (addrs, offsets, addrs));
return addrs;
}
return x;
}
gcc_unreachable ();
}
/* Convert a (mem: (reg:DI)) to (mem: (reg:V64DI)) with the
proper vector of stepped addresses.
MEM will be a DImode address of a vector in an SGPR.
TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
rtx
gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
rtx tmp)
{
gcc_assert (MEM_P (mem));
rtx mem_base = XEXP (mem, 0);
rtx mem_index = NULL_RTX;
if (!TARGET_GCN5_PLUS)
{
/* gcn_addr_space_legitimize_address should have put the address in a
register. If not, it is too late to do anything about it. */
gcc_assert (REG_P (mem_base));
}
if (GET_CODE (mem_base) == PLUS)
{
mem_index = XEXP (mem_base, 1);
mem_base = XEXP (mem_base, 0);
}
/* RF and RM base registers for vector modes should always be an SGPR. */
gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
|| REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
machine_mode inner = GET_MODE_INNER (mode);
int shift = exact_log2 (GET_MODE_SIZE (inner));
rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
rtx undef_v64si = gcn_gen_undef (V64SImode);
rtx new_base = NULL_RTX;
addr_space_t as = MEM_ADDR_SPACE (mem);
rtx tmplo = (REG_P (tmp)
? gcn_operand_part (V64DImode, tmp, 0)
: gen_reg_rtx (V64SImode));
/* tmplo[:] = ramp[:] << shift */
if (exec)
emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
gen_int_mode (shift, SImode),
undef_v64si, exec));
else
emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
if (AS_FLAT_P (as))
{
rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
if (REG_P (tmp))
{
rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
/* tmphi[:] = mem_base_hi */
if (exec)
emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
undef_v64si, exec));
else
emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
/* tmp[:] += zext (mem_base) */
if (exec)
{
emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
vcc, undef_v64si, exec));
emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
vcc, vcc, undef_v64si, exec));
}
else
emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
}
else
{
tmp = gen_reg_rtx (V64DImode);
if (exec)
emit_insn (gen_addv64di3_vcc_zext_dup2_exec
(tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
exec));
else
emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
}
new_base = tmp;
}
else if (AS_ANY_DS_P (as))
{
if (!exec)
emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
else
emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
gcn_gen_undef (V64SImode), exec));
new_base = tmplo;
}
else
{
mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
new_base = gen_rtx_PLUS (V64DImode, mem_base,
gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
}
return gen_rtx_PLUS (GET_MODE (new_base), new_base,
gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
(mem_index ? mem_index
: const0_rtx)));
}
/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
suitable for the given address space. This is intended for use in
gather/scatter patterns.
The offsets may be signed or unsigned, according to UNSIGNED_P.
If EXEC is set then _exec patterns will be used, otherwise plain.
Return values:
ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
rtx
gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
bool unsigned_p, rtx exec)
{
rtx tmpsi = gen_reg_rtx (V64SImode);
rtx tmpdi = gen_reg_rtx (V64DImode);
rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
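/* Scale the offsets: shift when SCALE is a power of two, otherwise
multiply. */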
if (CONST_INT_P (scale)
&& INTVAL (scale) > 0
&& exact_log2 (INTVAL (scale)) >= 0)
emit_insn (gen_ashlv64si3 (tmpsi, offsets,
GEN_INT (exact_log2 (INTVAL (scale)))));
else
(exec
? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
exec))
: emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
/* "Global" instructions do not support negative register offsets. */
if (as == ADDR_SPACE_FLAT || !unsigned_p)
{
if (unsigned_p)
(exec
? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
undefdi, exec))
: emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
else
(exec
? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
undefdi, exec))
: emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
return tmpdi;
}
else if (as == ADDR_SPACE_GLOBAL)
return tmpsi;
gcc_unreachable ();
}
/* Return true if a move between OP0 and OP1 is known to be executed in the
vector unit. */
bool
gcn_vgpr_move_p (rtx op0, rtx op1)
{
if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
return true;
if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
return true;
return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
|| (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
|| vgpr_vector_mode_p (GET_MODE (op0)));
}
/* Return true if a move between OP0 and OP1 is known to be executed in the
scalar unit. Used in the machine description. */
bool
gcn_sgpr_move_p (rtx op0, rtx op1)
{
if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
return true;
if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
return true;
if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
|| VGPR_REGNO_P (REGNO (op0)))
return false;
if (REG_P (op1)
&& REGNO (op1) < FIRST_PSEUDO_REGISTER
&& !VGPR_REGNO_P (REGNO (op1)))
return true;
return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
}
/* Implement TARGET_SECONDARY_RELOAD.
The address space determines which registers can be used for loads and
stores. */
static reg_class_t
gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
machine_mode reload_mode, secondary_reload_info *sri)
{
reg_class_t result = NO_REGS;
bool spilled_pseudo =
(REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "gcn_secondary_reload: ");
dump_value_slim (dump_file, x, 1);
fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
reg_class_names[rclass], GET_MODE_NAME (reload_mode));
if (REG_P (x) || GET_CODE (x) == SUBREG)
fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
(true_regnum (x) >= 0
&& true_regnum (x) < FIRST_PSEUDO_REGISTER
? reg_names[true_regnum (x)]
: (spilled_pseudo ? "stack spill" : "??")));
fprintf (dump_file, "\n");
}
/* Some callers don't use or initialize icode. */
sri->icode = CODE_FOR_nothing;
if (MEM_P (x) || spilled_pseudo)
{
addr_space_t as = DEFAULT_ADDR_SPACE;
/* If we have a spilled pseudo, we can't find the address space
directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
ADDR_SPACE_GLOBAL for GCN5. */
if (MEM_P (x))
as = MEM_ADDR_SPACE (x);
if (as == ADDR_SPACE_DEFAULT)
as = DEFAULT_ADDR_SPACE;
switch (as)
{
case ADDR_SPACE_SCALAR_FLAT:
result =
((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
break;
case ADDR_SPACE_FLAT:
case ADDR_SPACE_FLAT_SCRATCH:
case ADDR_SPACE_GLOBAL:
if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
{
if (in_p)
switch (reload_mode)
{
case E_V64SImode:
sri->icode = CODE_FOR_reload_inv64si;
break;
case E_V64SFmode:
sri->icode = CODE_FOR_reload_inv64sf;
break;
case E_V64HImode:
sri->icode = CODE_FOR_reload_inv64hi;
break;
case E_V64HFmode:
sri->icode = CODE_FOR_reload_inv64hf;
break;
case E_V64QImode:
sri->icode = CODE_FOR_reload_inv64qi;
break;
case E_V64DImode:
sri->icode = CODE_FOR_reload_inv64di;
break;
case E_V64DFmode:
sri->icode = CODE_FOR_reload_inv64df;
break;
default:
gcc_unreachable ();
}
else
switch (reload_mode)
{
case E_V64SImode:
sri->icode = CODE_FOR_reload_outv64si;
break;
case E_V64SFmode:
sri->icode = CODE_FOR_reload_outv64sf;
break;
case E_V64HImode:
sri->icode = CODE_FOR_reload_outv64hi;
break;
case E_V64HFmode:
sri->icode = CODE_FOR_reload_outv64hf;
break;
case E_V64QImode:
sri->icode = CODE_FOR_reload_outv64qi;
break;
case E_V64DImode:
sri->icode = CODE_FOR_reload_outv64di;
break;
case E_V64DFmode:
sri->icode = CODE_FOR_reload_outv64df;
break;
default:
gcc_unreachable ();
}
break;
}
/* Fallthrough. */
case ADDR_SPACE_LDS:
case ADDR_SPACE_GDS:
case ADDR_SPACE_SCRATCH:
result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
break;
}
}
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
get_insn_name (sri->icode));
return result;
}
/* Update register usage after having seen the compiler flags and kernel
attributes. We typically want to fix registers that contain values
set by the HSA runtime. */
static void
gcn_conditional_register_usage (void)
{
if (!cfun || !cfun->machine)
return;
if (cfun->machine->normal_function)
{
/* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
i <= LAST_SGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
i <= LAST_VGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
return;
}
/* If the set of requested args is the default set, nothing more needs to
be done. */
if (cfun->machine->args.requested == default_requested_args)
return;
/* Requesting a set of args different from the default violates the ABI. */
if (!leaf_function_p ())
warning (0, "A non-default set of initial values has been requested, "
"which violates the ABI");
for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
fixed_regs[i] = 0;
/* Fix the runtime argument register containing values that may be
needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
needed after the prologue so there's no need to fix them. */
if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
{
/* The upper 32-bits of the 64-bit descriptor are not used, so allow
the containing registers to be used for other purposes. */
fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
}
if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
{
fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
}
if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
{
fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
}
if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
}
/* Determine if a load or store is valid, according to the register classes
and address space. Used primarily by the machine description to decide
when to split a move into two steps. */
bool
gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
{
if (!MEM_P (dest) && !MEM_P (src))
return true;
if (MEM_P (dest)
&& AS_FLAT_P (MEM_ADDR_SPACE (dest))
&& (gcn_flat_address_p (XEXP (dest, 0), mode)
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_FLAT_P (MEM_ADDR_SPACE (src))
&& (gcn_flat_address_p (XEXP (src, 0), mode)
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (dest, mode))
return true;
if (MEM_P (dest)
&& AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
&& (gcn_global_address_p (XEXP (dest, 0))
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_GLOBAL_P (MEM_ADDR_SPACE (src))
&& (gcn_global_address_p (XEXP (src, 0))
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
&& gcn_vgpr_register_operand (dest, mode))
return true;
if (MEM_P (dest)
&& MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
&& (gcn_scalar_flat_address_p (XEXP (dest, 0))
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
&& gcn_ssrc_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
&& (gcn_scalar_flat_address_p (XEXP (src, 0))
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
&& gcn_sdst_register_operand (dest, mode))
return true;
if (MEM_P (dest)
&& AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
&& gcn_ds_address_p (XEXP (dest, 0))
&& gcn_vgpr_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_ANY_DS_P (MEM_ADDR_SPACE (src))
&& gcn_ds_address_p (XEXP (src, 0))
&& gcn_vgpr_register_operand (dest, mode))
return true;
return false;
}
/* }}} */
/* {{{ Functions and ABI. */
/* Implement TARGET_FUNCTION_VALUE.
Define how to find the value returned by a function.
The register location is always the same, but the mode depends on
VALTYPE. */
static rtx
gcn_function_value (const_tree valtype, const_tree, bool)
{
machine_mode mode = TYPE_MODE (valtype);
if (INTEGRAL_TYPE_P (valtype)
&& GET_MODE_CLASS (mode) == MODE_INT
&& GET_MODE_SIZE (mode) < 4)
mode = SImode;
return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
}
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
Return true if N is a possible register number for the function return
value. */
static bool
gcn_function_value_regno_p (const unsigned int n)
{
return n == RETURN_VALUE_REG;
}
/* Calculate the number of registers required to hold function argument
ARG. */
static int
num_arg_regs (const function_arg_info &arg)
{
if (targetm.calls.must_pass_in_stack (arg))
return 0;
int size = arg.promoted_size_in_bytes ();
return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
}
/* Implement TARGET_STRICT_ARGUMENT_NAMING.
Return true if the location where a function argument is passed
depends on whether or not it is a named argument.
For gcn, we know how to handle functions declared as stdarg: by
passing an extra pointer to the unnamed arguments. However, the
Fortran frontend can produce a different situation, where a
function pointer is declared with no arguments, but the actual
function and calls to it take more arguments. In that case, we
want to ensure the call matches the definition of the function. */
static bool
gcn_strict_argument_naming (cumulative_args_t cum_v)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
See comment on gcn_strict_argument_naming. */
static bool
gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
{
return !gcn_strict_argument_naming (cum_v);
}
/* Implement TARGET_FUNCTION_ARG.
Return an RTX indicating whether a function argument is passed in a register
and if so, which register. */
static rtx
gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
if (cum->normal_function)
{
if (!arg.named || arg.end_marker_p ())
return 0;
if (targetm.calls.must_pass_in_stack (arg))
return 0;
/* Vector parameters are not supported yet. */
if (VECTOR_MODE_P (arg.mode))
return 0;
int reg_num = FIRST_PARM_REG + cum->num;
int num_regs = num_arg_regs (arg);
if (num_regs > 0)
while (reg_num % num_regs != 0)
reg_num++;
if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
return gen_rtx_REG (arg.mode, reg_num);
}
else
{
if (cum->num >= cum->args.nargs)
{
cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
& -(TYPE_ALIGN (arg.type) / 8);
cfun->machine->kernarg_segment_alignment
= MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
TYPE_ALIGN (arg.type) / 8);
rtx addr = gen_rtx_REG (DImode,
cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
if (cum->offset)
addr = gen_rtx_PLUS (DImode, addr,
gen_int_mode (cum->offset, DImode));
rtx mem = gen_rtx_MEM (arg.mode, addr);
set_mem_attributes (mem, arg.type, 1);
set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
MEM_READONLY_P (mem) = 1;
return mem;
}
int a = cum->args.order[cum->num];
if (arg.mode != gcn_kernel_arg_types[a].mode)
{
error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
return 0;
}
return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
cum->args.reg[a]);
}
return 0;
}
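/* For illustration of the register-alignment loop in gcn_function_arg: a
DImode argument needs two registers (assuming 4-byte words), so REG_NUM is
rounded up to the next multiple of 2. Assuming FIRST_PARM_REG is even, an
argument arriving when cum->num is odd therefore skips one SGPR and lands
on an aligned register pair; the skipped register is left unused. */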
/* Implement TARGET_FUNCTION_ARG_ADVANCE.
Updates the summarizer variable pointed to by CUM_V to advance past an
argument in the argument list. */
static void
gcn_function_arg_advance (cumulative_args_t cum_v,
const function_arg_info &arg)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
if (cum->normal_function)
{
if (!arg.named)
return;
int num_regs = num_arg_regs (arg);
if (num_regs > 0)
while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
cum->num++;
cum->num += num_regs;
}
else
{
if (cum->num < cum->args.nargs)
cum->num++;
else
{
cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
cfun->machine->kernarg_segment_byte_size = cum->offset;
}
}
}
/* Implement TARGET_ARG_PARTIAL_BYTES.
Returns the number of bytes at the beginning of an argument that must be put
in registers. The value must be zero for arguments that are passed entirely
in registers or that are entirely pushed on the stack. */
static int
gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
if (!arg.named)
return 0;
if (targetm.calls.must_pass_in_stack (arg))
return 0;
if (cum->num >= NUM_PARM_REGS)
return 0;
/* If the argument fits entirely in registers, return 0. */
if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
return 0;
return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
}
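/* For example, if only two parameter registers remain free but the argument
needs four, the first 2 * UNITS_PER_WORD bytes (8 bytes, assuming 4-byte
words) are passed in registers and the remainder is passed on the stack. */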
/* A normal function which takes a pointer argument may be passed a pointer to
LDS space (via a high-bits-set aperture), and that only works with FLAT
addressing, not GLOBAL. Force FLAT addressing if the function has an
incoming pointer parameter. NOTE: This is a heuristic that works in the
offloading case, but in general, a function might read global pointer
variables, etc. that may refer to LDS space or other special memory areas
not supported by GLOBAL instructions, and then this argument check would not
suffice. */
static void
gcn_detect_incoming_pointer_arg (tree fndecl)
{
gcc_assert (cfun && cfun->machine);
for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
arg;
arg = TREE_CHAIN (arg))
if (POINTER_TYPE_P (TREE_VALUE (arg)))
cfun->machine->use_flat_addressing = true;
}
/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
whose data type is FNTYPE. For a library call, FNTYPE is 0. */
void
gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
tree fntype /* tree ptr for function decl */ ,
rtx libname /* SYMBOL_REF of library name or 0 */ ,
tree fndecl, int caller)
{
memset (cum, 0, sizeof (*cum));
cum->fntype = fntype;
if (libname)
{
gcc_assert (cfun && cfun->machine);
cum->normal_function = true;
if (!caller)
{
cfun->machine->normal_function = true;
gcn_detect_incoming_pointer_arg (fndecl);
}
return;
}
tree attr = NULL;
if (fndecl)
attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
if (fndecl && !attr)
attr = lookup_attribute ("amdgpu_hsa_kernel",
TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
if (!attr && fntype)
attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
/* Handle main () as a kernel, so that we can run the testsuite.
Handle OpenACC kernels similarly to main. */
if (!attr && !caller && fndecl
&& (MAIN_NAME_P (DECL_NAME (fndecl))
|| lookup_attribute ("omp target entrypoint",
DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
else
{
if (!attr || caller)
{
gcc_assert (cfun && cfun->machine);
cum->normal_function = true;
if (!caller)
cfun->machine->normal_function = true;
}
gcn_parse_amdgpu_hsa_kernel_attribute
(&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
}
cfun->machine->args = cum->args;
if (!caller && cfun->machine->normal_function)
gcn_detect_incoming_pointer_arg (fndecl);
reinit_regs ();
}
static bool
gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
{
machine_mode mode = TYPE_MODE (type);
HOST_WIDE_INT size = int_size_in_bytes (type);
if (AGGREGATE_TYPE_P (type))
return true;
/* Vector return values are not supported yet. */
if (VECTOR_TYPE_P (type))
return true;
if (mode == BLKmode)
return true;
if (size > 2 * UNITS_PER_WORD)
return true;
return false;
}
/* Implement TARGET_PROMOTE_FUNCTION_MODE.
Return the mode to use for outgoing function arguments. */
machine_mode
gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
int *ARG_UNUSED (punsignedp),
const_tree ARG_UNUSED (funtype),
int ARG_UNUSED (for_return))
{
if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
return SImode;
return mode;
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
ARGS_GROW_DOWNWARDS. */
static tree
gcn_gimplify_va_arg_expr (tree valist, tree type,
gimple_seq *ARG_UNUSED (pre_p),
gimple_seq *ARG_UNUSED (post_p))
{
tree ptr = build_pointer_type (type);
tree valist_type;
tree t, u;
bool indirect;
indirect = pass_va_arg_by_reference (type);
if (indirect)
{
type = ptr;
ptr = build_pointer_type (type);
}
valist_type = TREE_TYPE (valist);
/* Args grow down. Not handled by generic routines. */
u = fold_convert (sizetype, size_in_bytes (type));
u = fold_build1 (NEGATE_EXPR, sizetype, u);
t = fold_build_pointer_plus (valist, u);
/* Align to 8 byte boundary. */
u = build_int_cst (TREE_TYPE (t), -8);
t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
t = fold_convert (valist_type, t);
t = build2 (MODIFY_EXPR, valist_type, valist, t);
t = fold_convert (ptr, t);
t = build_va_arg_indirect_ref (t);
if (indirect)
t = build_va_arg_indirect_ref (t);
return t;
}
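/* For illustration, in C-like pseudocode the sequence built above amounts to
(for a non-indirect argument of type T):

valist = (void *) (((uintptr_t) valist - sizeof (T)) & -8);
result = *(T *) valist;

i.e. the argument pointer is decremented by the argument size and aligned
down to an 8-byte boundary before the load, matching the downward-growing
argument area. */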
/* Return 1 if TRAIT NAME is present in the OpenMP context's
device trait set, return 0 if not present in any OpenMP context in the
whole translation unit, or -1 if not present in the current OpenMP context
but might be present in another OpenMP context in the same TU. */
int
gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
const char *name)
{
switch (trait)
{
case omp_device_kind:
return strcmp (name, "gpu") == 0;
case omp_device_arch:
return strcmp (name, "gcn") == 0;
case omp_device_isa:
if (strcmp (name, "fiji") == 0)
return gcn_arch == PROCESSOR_FIJI;
if (strcmp (name, "gfx900") == 0)
return gcn_arch == PROCESSOR_VEGA10;
if (strcmp (name, "gfx906") == 0)
return gcn_arch == PROCESSOR_VEGA20;
if (strcmp (name, "gfx908") == 0)
return gcn_arch == PROCESSOR_GFX908;
return 0;
default:
gcc_unreachable ();
}
}
/* Calculate stack offsets needed to create prologues and epilogues. */
static struct machine_function *
gcn_compute_frame_offsets (void)
{
machine_function *offsets = cfun->machine;
if (reload_completed)
return offsets;
offsets->need_frame_pointer = frame_pointer_needed;
offsets->outgoing_args_size = crtl->outgoing_args_size;
offsets->pretend_size = crtl->args.pretend_args_size;
offsets->local_vars = get_frame_size ();
offsets->lr_needs_saving = (!leaf_function_p ()
|| df_regs_ever_live_p (LR_REGNUM)
|| df_regs_ever_live_p (LR_REGNUM + 1));
offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
&& frame_pointer_needed))
offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
/* Round up to 64-bit boundary to maintain stack alignment. */
offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
return offsets;
}
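/* Worked example: a function that must save the link register (8 bytes) plus
two call-saved SGPRs (4 bytes each) gets callee_saves = 16, which is
already 64-bit aligned; each call-saved VGPR adds a further 256 bytes
(64 lanes x 4 bytes). */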
/* Insert code into the prologue or epilogue to store or load any
callee-save register to/from the stack.
Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
static void
move_callee_saved_registers (rtx sp, machine_function *offsets,
bool prologue)
{
int regno, offset, saved_scalars;
rtx exec = gen_rtx_REG (DImode, EXEC_REG);
rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
HOST_WIDE_INT exec_set = 0;
int offreg_set = 0;
auto_vec saved_sgprs;
start_sequence ();
/* Move scalars into two vector registers. */
for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
&& offsets->need_frame_pointer))
{
rtx reg = gen_rtx_REG (SImode, regno);
rtx vreg = gen_rtx_REG (V64SImode,
VGPR_REGNO (6 + (saved_scalars / 64)));
int lane = saved_scalars % 64;
if (prologue)
{
emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
saved_sgprs.safe_push (regno);
}
else
emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
saved_scalars++;
}
rtx move_scalars = get_insns ();
end_sequence ();
start_sequence ();
/* Ensure that all vector lanes are moved. */
exec_set = -1;
emit_move_insn (exec, GEN_INT (exec_set));
/* Set up a vector stack pointer. */
rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
gcn_gen_undef (V64SImode), exec));
rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
exec));
emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
gcn_operand_part (V64SImode, vsp, 0),
_0_4_8_12, vcc, gcn_gen_undef (V64SImode),
exec));
emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
gcn_operand_part (V64SImode, vsp, 1),
const0_rtx, vcc, vcc,
gcn_gen_undef (V64SImode), exec));
/* Move vectors. */
for (regno = FIRST_VGPR_REG, offset = 0;
regno < FIRST_PSEUDO_REGISTER; regno++)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| (regno == VGPR_REGNO (6) && saved_scalars > 0)
|| (regno == VGPR_REGNO (7) && saved_scalars > 63))
{
rtx reg = gen_rtx_REG (V64SImode, regno);
int size = 256;
if (regno == VGPR_REGNO (6) && saved_scalars < 64)
size = saved_scalars * 4;
else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
size = (saved_scalars - 64) * 4;
if (size != 256 || exec_set != -1)
{
exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
emit_move_insn (exec, gen_int_mode (exec_set, DImode));
}
if (prologue)
{
rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
(vsp, const0_rtx, reg, as, const0_rtx,
exec));
/* Add CFI metadata. */
rtx note;
if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
{
int start = (regno == VGPR_REGNO (7) ? 64 : 0);
int count = MIN (saved_scalars - start, 64);
int add_lr = (regno == VGPR_REGNO (6)
&& offsets->lr_needs_saving);
int lrdest = -1;
rtvec seq = rtvec_alloc (count + add_lr);
/* Add a REG_FRAME_RELATED_EXPR entry for each scalar
register that was saved in this batch. */
for (int idx = 0; idx < count; idx++)
{
int stackaddr = offset + idx * 4;
rtx dest = gen_rtx_MEM (SImode,
gen_rtx_PLUS
(DImode, sp,
GEN_INT (stackaddr)));
rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
rtx set = gen_rtx_SET (dest, src);
RTX_FRAME_RELATED_P (set) = 1;
RTVEC_ELT (seq, idx) = set;
if (saved_sgprs[start + idx] == LINK_REGNUM)
lrdest = stackaddr;
}
/* Add an additional expression for DWARF_LINK_REGISTER if
LINK_REGNUM was saved. */
if (lrdest != -1)
{
rtx dest = gen_rtx_MEM (DImode,
gen_rtx_PLUS
(DImode, sp,
GEN_INT (lrdest)));
rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
rtx set = gen_rtx_SET (dest, src);
RTX_FRAME_RELATED_P (set) = 1;
RTVEC_ELT (seq, count) = set;
}
note = gen_rtx_SEQUENCE (VOIDmode, seq);
}
else
{
rtx dest = gen_rtx_MEM (V64SImode,
gen_rtx_PLUS (DImode, sp,
GEN_INT (offset)));
rtx src = gen_rtx_REG (V64SImode, regno);
note = gen_rtx_SET (dest, src);
}
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
}
else
emit_insn (gen_gatherv64si_insn_1offset_exec
(reg, vsp, const0_rtx, as, const0_rtx,
gcn_gen_undef (V64SImode), exec));
/* Move our VSP to the next stack entry. */
if (offreg_set != size)
{
offreg_set = size;
emit_move_insn (offreg, GEN_INT (size));
}
if (exec_set != -1)
{
exec_set = -1;
emit_move_insn (exec, GEN_INT (exec_set));
}
emit_insn (gen_addv64si3_vcc_dup_exec
(gcn_operand_part (V64SImode, vsp, 0),
offreg, gcn_operand_part (V64SImode, vsp, 0),
vcc, gcn_gen_undef (V64SImode), exec));
emit_insn (gen_addcv64si3_exec
(gcn_operand_part (V64SImode, vsp, 1),
gcn_operand_part (V64SImode, vsp, 1),
const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
offset += size;
}
rtx move_vectors = get_insns ();
end_sequence ();
if (prologue)
{
emit_insn (move_scalars);
emit_insn (move_vectors);
}
else
{
emit_insn (move_vectors);
emit_insn (move_scalars);
}
}
/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
For a non-kernel function, the stack layout looks like this (interim),
growing *upwards*:
hi | + ...
|__________________| <-- current SP
| outgoing args |
|__________________|
| (alloca space) |
|__________________|
| local vars |
|__________________| <-- FP/hard FP
| callee-save regs |
|__________________| <-- soft arg pointer
| pretend args |
|__________________| <-- incoming SP
| incoming args |
lo |..................|
This implies arguments (beyond the first N in registers) must grow
downwards (as, apparently, PA has them do).
For a kernel function we have the simpler:
hi | + ...
|__________________| <-- current SP
| outgoing args |
|__________________|
| (alloca space) |
|__________________|
| local vars |
lo |__________________| <-- FP/hard FP
*/
void
gcn_expand_prologue ()
{
machine_function *offsets = gcn_compute_frame_offsets ();
if (!cfun || !cfun->machine || cfun->machine->normal_function)
{
rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
start_sequence ();
if (offsets->pretend_size > 0)
{
/* FIXME: Do the actual saving of register pretend args to the stack.
Register order needs consideration. */
}
/* Save callee-save regs. */
move_callee_saved_registers (sp, offsets, true);
HOST_WIDE_INT sp_adjust = offsets->pretend_size
+ offsets->callee_saves
+ offsets->local_vars + offsets->outgoing_args_size;
if (sp_adjust > 0)
{
/* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
we use a split add explicitly, and specify the DImode add in
the note. */
rtx scc = gen_rtx_REG (BImode, SCC_REG);
rtx adjustment = gen_int_mode (sp_adjust, SImode);
rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
adjustment, scc));
if (!offsets->need_frame_pointer)
{
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_FRAME_RELATED_EXPR,
gen_rtx_SET (sp,
gen_rtx_PLUS (DImode, sp,
adjustment)));
}
emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
}
if (offsets->need_frame_pointer)
{
/* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
we use a split add explicitly, and specify the DImode add in
the note. */
rtx scc = gen_rtx_REG (BImode, SCC_REG);
int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
rtx adjustment = gen_int_mode (fp_adjust, SImode);
rtx insn = emit_insn (gen_addsi3_scalar_carry(fp_lo, sp_lo,
adjustment, scc));
emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
(fp_adjust < 0 ? GEN_INT (-1)
: const0_rtx),
scc, scc));
/* Set the CFA to the entry stack address, as an offset from the
frame pointer. This is preferred because the frame pointer is
saved in each frame, whereas the stack pointer is not. */
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_CFA_DEF_CFA,
gen_rtx_PLUS (DImode, fp,
GEN_INT (-(offsets->pretend_size
+ offsets->callee_saves))));
}
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn (seq);
}
else
{
rtx wave_offset = gen_rtx_REG (SImode,
cfun->machine->args.
reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
{
rtx fs_init_lo =
gen_rtx_REG (SImode,
cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
rtx fs_init_hi =
gen_rtx_REG (SImode,
cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
/*rtx queue = gen_rtx_REG(DImode,
cfun->machine->args.reg[QUEUE_PTR_ARG]);
rtx aperture = gen_rtx_MEM (SImode,
gen_rtx_PLUS (DImode, queue,
gen_int_mode (68, SImode)));
set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
/* Set up flat_scratch. */
emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
gen_int_mode (8, SImode)));
emit_move_insn (fs_reg_lo, fs_init_hi);
}
/* Set up frame pointer and stack pointer. */
rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
HOST_WIDE_INT sp_adjust = (offsets->local_vars
+ offsets->outgoing_args_size);
/* Initialise FP and SP from the buffer descriptor in s[0:3]. */
emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
gen_int_mode (0xffff, SImode)));
rtx scc = gen_rtx_REG (BImode, SCC_REG);
emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
/* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
a split add explicitly, and specify the DImode add in the note.
The DWARF info expects that the callee-save data is in the frame,
even though it isn't (because this is the entry point), so we
make a notional adjustment to the DWARF frame offset here. */
rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
DImode);
rtx insn;
if (sp_adjust > 0)
{
rtx scc = gen_rtx_REG (BImode, SCC_REG);
rtx adjustment = gen_int_mode (sp_adjust, DImode);
insn = emit_insn (gen_addsi3_scalar_carry(sp_lo, fp_lo, adjustment,
scc));
emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
}
else
insn = emit_move_insn (sp, fp);
RTX_FRAME_RELATED_P (insn) = 1;
add_reg_note (insn, REG_FRAME_RELATED_EXPR,
gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
dbg_adjustment)));
if (offsets->need_frame_pointer)
{
/* Set the CFA to the entry stack address, as an offset from the
frame pointer. This is necessary when alloca is used, and
harmless otherwise. */
rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
add_reg_note (insn, REG_CFA_DEF_CFA,
gen_rtx_PLUS (DImode, fp, neg_adjust));
}
/* Make sure the flat scratch reg doesn't get optimised away. */
emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
}
/* Ensure that the scheduler doesn't do anything unexpected. */
emit_insn (gen_blockage ());
/* m0 is initialized for the usual LDS DS and FLAT memory case.
The low-part is the address of the topmost addressable byte, which is
size-1. The high-part is an offset and should be zero. */
emit_move_insn (gen_rtx_REG (SImode, M0_REG),
gen_int_mode (LDS_SIZE, SImode));
emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
{
/* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
"gomp_gcn_enter_kernel"));
emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
}
}
/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
See gcn_expand_prologue for stack details. */
void
gcn_expand_epilogue (void)
{
/* Ensure that the scheduler doesn't do anything unexpected. */
emit_insn (gen_blockage ());
if (!cfun || !cfun->machine || cfun->machine->normal_function)
{
machine_function *offsets = gcn_compute_frame_offsets ();
rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
if (offsets->need_frame_pointer)
{
/* Restore old SP from the frame pointer. */
if (sp_adjust > 0)
emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
else
emit_move_insn (sp, fp);
}
else
{
/* Restore old SP from current SP. */
sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
if (sp_adjust > 0)
emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
}
move_callee_saved_registers (sp, offsets, false);
/* There's no explicit use of the link register on the return insn. Emit
one here instead. */
if (offsets->lr_needs_saving)
emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
/* Similar for frame pointer. */
if (offsets->need_frame_pointer)
emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
}
else if (flag_openmp)
{
/* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
emit_move_insn (fn_reg,
gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
}
else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
{
/* Assume that an exit value compatible with gcn-run is expected.
That is, the third input parameter is an int*.
We can't allocate any new registers, but the kernarg_reg is
dead after this, so we'll use that. */
rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
[KERNARG_SEGMENT_PTR_ARG]);
rtx retptr_mem = gen_rtx_MEM (DImode,
gen_rtx_PLUS (DImode, kernarg_reg,
GEN_INT (16)));
set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
emit_move_insn (kernarg_reg, retptr_mem);
rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
emit_move_insn (retval_mem,
gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
}
emit_jump_insn (gen_gcn_return ());
}
/* Implement TARGET_FRAME_POINTER_REQUIRED.
Return true if the frame pointer should not be eliminated. */
bool
gcn_frame_pointer_rqd (void)
{
/* GDB needs the frame pointer in order to unwind properly,
but that's not important for the entry point, unless alloca is used.
It's not important for code execution, so we should respect the
-fomit-frame-pointer flag. */
return (!flag_omit_frame_pointer
&& cfun
&& (cfun->calls_alloca
|| (cfun->machine && cfun->machine->normal_function)));
}
/* Implement TARGET_CAN_ELIMINATE.
Return true if the compiler is allowed to try to replace register number
FROM_REG with register number TO_REG.
FIXME: is the default "true" not enough? Should this be a negative set? */
bool
gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
{
return (to_reg == HARD_FRAME_POINTER_REGNUM
|| to_reg == STACK_POINTER_REGNUM);
}
/* Implement INITIAL_ELIMINATION_OFFSET.
Returns the initial difference between the specified pair of registers, in
terms of stack position. */
HOST_WIDE_INT
gcn_initial_elimination_offset (int from, int to)
{
machine_function *offsets = gcn_compute_frame_offsets ();
switch (from)
{
case ARG_POINTER_REGNUM:
if (to == STACK_POINTER_REGNUM)
return -(offsets->callee_saves + offsets->local_vars
+ offsets->outgoing_args_size);
else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
return -offsets->callee_saves;
else
gcc_unreachable ();
break;
case FRAME_POINTER_REGNUM:
if (to == STACK_POINTER_REGNUM)
return -(offsets->local_vars + offsets->outgoing_args_size);
else if (to == HARD_FRAME_POINTER_REGNUM)
return 0;
else
gcc_unreachable ();
break;
default:
gcc_unreachable ();
}
}
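/* For example, with callee_saves = 16, local_vars = 32 and
outgoing_args_size = 0, eliminating ARG_POINTER_REGNUM to
STACK_POINTER_REGNUM yields -48, while eliminating it to the frame pointer
yields -16, matching the stack layout pictured above gcn_expand_prologue. */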
/* Implement HARD_REGNO_RENAME_OK.
Return true if it is permissible to rename a hard register from
FROM_REG to TO_REG. */
bool
gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
{
if (from_reg == SCC_REG
|| from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
|| from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
|| to_reg == SCC_REG
|| to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
|| to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
return false;
/* Allow the link register to be used if it was saved. */
if ((to_reg & ~1) == LINK_REGNUM)
return !cfun || cfun->machine->lr_needs_saving;
/* Allow the registers used for the static chain to be used if the chain is
not in active use. */
if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
return !cfun
|| !(cfun->static_chain_decl
&& df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
&& df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
return true;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.
Which mode is required for saving NREGS of a pseudo-register in
call-clobbered hard register REGNO. */
machine_mode
gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
machine_mode regmode)
{
machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
result = (nregs == 1 ? SImode : DImode);
return result;
}
/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
Output assembler code for a block containing the constant parts
of a trampoline, leaving space for the variable parts. */
static void
gcn_asm_trampoline_template (FILE *f)
{
/* The source operand of the move instructions must be a 32-bit
constant following the opcode. */
asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
}
/* Implement TARGET_TRAMPOLINE_INIT.
Emit RTL insns to initialize the variable parts of a trampoline.
FNDECL is the decl of the target address, M_TRAMP is a MEM for
the trampoline, and CHAIN_VALUE is an RTX for the static chain
to be passed to the target function. */
static void
gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
if (TARGET_GCN5_PLUS)
sorry ("nested function trampolines not supported on GCN5 due to"
" non-executable stacks");
emit_block_move (m_tramp, assemble_trampoline_template (),
GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
rtx chain_value_reg = copy_to_reg (chain_value);
rtx fnaddr_reg = copy_to_reg (fnaddr);
for (int i = 0; i < 4; i++)
{
rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
}
rtx tramp_addr = XEXP (m_tramp, 0);
emit_insn (gen_clear_icache (tramp_addr,
plus_constant (ptr_mode, tramp_addr,
TRAMPOLINE_SIZE)));
}
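/* For illustration: each s_mov_b32 with a 32-bit literal in the template
above occupies 8 bytes (a 4-byte encoding followed by the literal), which is
why the loop patches the literal slots at byte offsets 4, 12, 20 and 28.
The first two slots receive the static chain value (loaded into the
STATIC_CHAIN_REGNUM pair) and the last two the target function address,
after which s_setpc_b64 jumps via the CC_SAVE_REG pair. */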
/* }}} */
/* {{{ Miscellaneous. */
/* Implement TARGET_CANNOT_COPY_INSN_P.
Return true if INSN must not be duplicated. */
static bool
gcn_cannot_copy_insn_p (rtx_insn *insn)
{
if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
return true;
return false;
}
/* Implement TARGET_DEBUG_UNWIND_INFO.
Defines the mechanism that will be used for describing frame unwind
information to the debugger. */
static enum unwind_info_type
gcn_debug_unwind_info ()
{
return UI_DWARF2;
}
/* Determine if there is a suitable hardware conversion instruction.
Used primarily by the machine description. */
bool
gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
{
if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
return false;
if (VECTOR_MODE_P (from))
{
from = GET_MODE_INNER (from);
to = GET_MODE_INNER (to);
}
switch (op)
{
case fix_trunc_cvt:
case fixuns_trunc_cvt:
if (GET_MODE_CLASS (from) != MODE_FLOAT
|| GET_MODE_CLASS (to) != MODE_INT)
return false;
break;
case float_cvt:
case floatuns_cvt:
if (GET_MODE_CLASS (from) != MODE_INT
|| GET_MODE_CLASS (to) != MODE_FLOAT)
return false;
break;
case extend_cvt:
if (GET_MODE_CLASS (from) != MODE_FLOAT
|| GET_MODE_CLASS (to) != MODE_FLOAT
|| GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
return false;
break;
case trunc_cvt:
if (GET_MODE_CLASS (from) != MODE_FLOAT
|| GET_MODE_CLASS (to) != MODE_FLOAT
|| GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
return false;
break;
}
return ((to == HImode && from == HFmode)
|| (to == SImode && (from == SFmode || from == DFmode))
|| (to == HFmode && (from == HImode || from == SFmode))
|| (to == SFmode && (from == SImode || from == HFmode
|| from == DFmode))
|| (to == DFmode && (from == SImode || from == SFmode)));
}
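/* For example, extend_cvt from HFmode to SFmode is accepted (both operands
are float class, the size grows, and SFmode-from-HFmode appears in the list
above), whereas trunc_cvt from SFmode to DFmode is rejected because the
destination is not smaller than the source. */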
/* Implement TARGET_EMUTLS_VAR_INIT.
Disable emutls (gthr-gcn.h does not support it, yet). */
tree
gcn_emutls_var_init (tree, tree decl, tree)
{
sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
return NULL_TREE;
}
/* }}} */
/* {{{ Costs. */
/* Implement TARGET_RTX_COSTS.
Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
static bool
gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
{
enum rtx_code code = GET_CODE (x);
switch (code)
{
case CONST:
case CONST_DOUBLE:
case CONST_VECTOR:
case CONST_INT:
if (gcn_inline_constant_p (x))
*total = 0;
else if (code == CONST_INT
&& ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
*total = 1;
else if (gcn_constant_p (x))
*total = 2;
else
*total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
return true;
case DIV:
*total = 100;
return false;
default:
*total = 3;
return false;
}
}
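/* For illustration: an inlineable constant such as 1 costs 0; a CONST_INT
that fits a signed 16-bit immediate (e.g. 0x1234) costs 1; other constants
accepted by gcn_constant_p cost 2; anything else costs 4, or 64 when it is
in a VGPR vector mode. */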
/* Implement TARGET_MEMORY_MOVE_COST.
Return the cost of moving data of mode M between a
register and memory. A value of 2 is the default; this cost is
relative to those in `REGISTER_MOVE_COST'.
This function is used extensively by register_move_cost, which is used to
build tables at startup; make it inline in this case.
When IN is 2, return the maximum of the in and out move costs.
If moving between registers and memory is more expensive than
between two registers, you should define this macro to express the
relative cost.
Also model the increased cost of moving QImode registers in
non-Q_REGS classes. */
#define LOAD_COST 32
#define STORE_COST 32
static int
gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
{
int nregs = CEIL (GET_MODE_SIZE (mode), 4);
switch (regclass)
{
case SCC_CONDITIONAL_REG:
case VCCZ_CONDITIONAL_REG:
case VCC_CONDITIONAL_REG:
case EXECZ_CONDITIONAL_REG:
case ALL_CONDITIONAL_REGS:
case SGPR_REGS:
case SGPR_EXEC_REGS:
case EXEC_MASK_REG:
case SGPR_VOP_SRC_REGS:
case SGPR_MEM_SRC_REGS:
case SGPR_SRC_REGS:
case SGPR_DST_REGS:
case GENERAL_REGS:
case AFP_REGS:
if (!in)
return (STORE_COST + 2) * nregs;
return LOAD_COST * nregs;
case VGPR_REGS:
if (in)
return (LOAD_COST + 2) * nregs;
return STORE_COST * nregs;
case ALL_REGS:
case ALL_GPR_REGS:
case SRCDST_REGS:
if (in)
return (LOAD_COST + 2) * nregs;
return (STORE_COST + 2) * nregs;
default:
gcc_unreachable ();
}
}
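/* For example, with LOAD_COST and STORE_COST both 32, loading a DImode value
(nregs == 2) into SGPRs costs 64 while storing it costs 68; for VGPRs the
extra cost of 2 per register applies to loads instead of stores. */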
/* Implement TARGET_REGISTER_MOVE_COST.
Return the cost of moving data from a register in class CLASS1 to
one in class CLASS2. Base value is 2. */
static int
gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
{
/* Increase the cost of moving from and to vector registers. While this is
fast in hardware (I think), it has the hidden cost of setting up the exec
flags. */
if ((src < VGPR_REGS) != (dst < VGPR_REGS))
return 4;
return 2;
}
/* }}} */
/* {{{ Builtins. */
/* Type codes used by GCN built-in definitions. */
enum gcn_builtin_type_index
{
GCN_BTI_END_OF_PARAMS,
GCN_BTI_VOID,
GCN_BTI_BOOL,
GCN_BTI_INT,
GCN_BTI_UINT,
GCN_BTI_SIZE_T,
GCN_BTI_LLINT,
GCN_BTI_LLUINT,
GCN_BTI_EXEC,
GCN_BTI_SF,
GCN_BTI_V64SI,
GCN_BTI_V64SF,
GCN_BTI_V64PTR,
GCN_BTI_SIPTR,
GCN_BTI_SFPTR,
GCN_BTI_VOIDPTR,
GCN_BTI_LDS_VOIDPTR,
GCN_BTI_MAX
};
static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
struct gcn_builtin_description *);
static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
struct gcn_builtin_description *);
struct gcn_builtin_description;
typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
struct gcn_builtin_description *);
enum gcn_builtin_type
{
B_UNIMPLEMENTED, /* Sorry out */
B_INSN, /* Emit a pattern */
B_OVERLOAD /* Placeholder for an overloaded function */
};
struct gcn_builtin_description
{
int fcode;
int icode;
const char *name;
enum gcn_builtin_type type;
/* The first element of parm is always the return type. The rest
form a zero-terminated list of parameters. */
int parm[6];
gcn_builtin_expander expander;
};
/* Read in the GCN builtins from gcn-builtins.def. */
extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
struct gcn_builtin_description gcn_builtins[] = {
#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
{GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
{GCN_BUILTIN_ ## fcode ## _V64SI, \
CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
{GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
{GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
{GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
#include "gcn-builtins.def"
#undef DEF_BUILTIN_BINOP_INT_FP
#undef DEF_BUILTIN
};
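/* For illustration (hypothetical entry): a line such as
DEF_BUILTIN_BINOP_INT_FP (ADD, add, "add") in gcn-builtins.def would expand
to two table entries named "add_v64int" and "add_v64int_unspec", both using
CODE_FOR_addv64si3_exec and expanded by gcn_expand_builtin_binop; the
builtins actually available are defined in gcn-builtins.def, and each is
exposed to users as __builtin_gcn_<name> by gcn_init_builtins. */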
static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
/* Implement TARGET_BUILTIN_DECL.
Return the GCN builtin for CODE. */
tree
gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
if (code >= GCN_BUILTIN_MAX)
return error_mark_node;
return gcn_builtin_decls[code];
}
/* Helper function for gcn_init_builtins. */
static void
gcn_init_builtin_types (void)
{
gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
exec_type_node = unsigned_intDI_type_node;
sf_type_node = float32_type_node;
v64si_type_node = build_vector_type (intSI_type_node, 64);
v64sf_type_node = build_vector_type (float_type_node, 64);
v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
/*build_pointer_type
(integer_type_node) */
, 64);
tree tmp = build_distinct_type_copy (intSI_type_node);
TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
siptr_type_node = build_pointer_type (tmp);
tmp = build_distinct_type_copy (float_type_node);
TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
sfptr_type_node = build_pointer_type (tmp);
tmp = build_distinct_type_copy (void_type_node);
TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
voidptr_type_node = build_pointer_type (tmp);
tmp = build_distinct_type_copy (void_type_node);
TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
}
/* Implement TARGET_INIT_BUILTINS.
Set up all builtin functions for this target. */
static void
gcn_init_builtins (void)
{
gcn_init_builtin_types ();
struct gcn_builtin_description *d;
unsigned int i;
for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
{
tree p;
char name[64]; /* build_function will make a copy. */
int parm;
/* FIXME: Is this necessary/useful? */
if (d->name == 0)
continue;
/* Find last parm. */
for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
;
p = void_list_node;
while (parm > 1)
p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
p = build_function_type (gcn_builtin_types[d->parm[0]], p);
sprintf (name, "__builtin_gcn_%s", d->name);
gcn_builtin_decls[i]
= add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
/* These builtins don't throw. */
TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
}
/* These builtins need to take/return an LDS pointer: override the generic
versions here. */
set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
false);
set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
false);
set_builtin_decl (BUILT_IN_GOACC_BARRIER,
gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
}
/* Implement TARGET_INIT_LIBFUNCS. */
static void
gcn_init_libfuncs (void)
{
/* BITS_PER_WORD * 2 is 64 bits, which causes
optabs-libfuncs.cc:gen_int_libfunc to omit TImode (i.e. 128 bits)
libcalls that we need to support operations for that type. Initialise
them here instead. */
set_optab_libfunc (udiv_optab, TImode, "__udivti3");
set_optab_libfunc (umod_optab, TImode, "__umodti3");
set_optab_libfunc (sdiv_optab, TImode, "__divti3");
set_optab_libfunc (smod_optab, TImode, "__modti3");
set_optab_libfunc (smul_optab, TImode, "__multi3");
set_optab_libfunc (addv_optab, TImode, "__addvti3");
set_optab_libfunc (subv_optab, TImode, "__subvti3");
set_optab_libfunc (negv_optab, TImode, "__negvti2");
set_optab_libfunc (absv_optab, TImode, "__absvti2");
set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
set_optab_libfunc (clz_optab, TImode, "__clzti2");
set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
set_optab_libfunc (parity_optab, TImode, "__parityti2");
set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
}
/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
not require taking the address of any object, other than the memory
cell being operated on.
Helper function for gcn_expand_builtin_1. */
static rtx
gcn_expand_cmp_swap (tree exp, rtx target)
{
machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
addr_space_t as
= TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
machine_mode as_mode = gcn_addr_space_address_mode (as);
if (!target)
target = gen_reg_rtx (mode);
rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
NULL_RTX, as_mode, EXPAND_NORMAL);
rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
NULL_RTX, mode, EXPAND_NORMAL);
rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
NULL_RTX, mode, EXPAND_NORMAL);
rtx pat;
rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
set_mem_addr_space (mem, as);
if (!REG_P (cmp))
cmp = copy_to_mode_reg (mode, cmp);
if (!REG_P (src))
src = copy_to_mode_reg (mode, src);
if (mode == SImode)
pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
else
pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
emit_insn (pat);
return target;
}
/* Expand many different builtins.
Intended for use in gcn-builtins.def. */
static rtx
gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
machine_mode /*mode */ , int ignore,
struct gcn_builtin_description *)
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
switch (DECL_MD_FUNCTION_CODE (fndecl))
{
case GCN_BUILTIN_FLAT_LOAD_INT32:
{
if (ignore)
return target;
/*rtx exec = */
force_reg (DImode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
EXPAND_NORMAL));
/*rtx ptr = */
force_reg (V64DImode,
expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
EXPAND_NORMAL));
/*emit_insn (gen_vector_flat_loadv64si
(target, gcn_gen_undef (V64SImode), ptr, exec)); */
return target;
}
case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
{
if (ignore)
return target;
rtx exec = force_reg (DImode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
DImode,
EXPAND_NORMAL));
rtx ptr = force_reg (DImode,
expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
V64DImode,
EXPAND_NORMAL));
rtx offsets = force_reg (V64SImode,
expand_expr (CALL_EXPR_ARG (exp, 2),
NULL_RTX, V64DImode,
EXPAND_NORMAL));
rtx addrs = gen_reg_rtx (V64DImode);
rtx tmp = gen_reg_rtx (V64SImode);
emit_insn (gen_ashlv64si3_exec (tmp, offsets,
GEN_INT (2),
gcn_gen_undef (V64SImode), exec));
emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
gcn_gen_undef (V64DImode),
exec));
rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
/* FIXME: set attributes. */
emit_insn (gen_mov_with_exec (target, mem, exec));
return target;
}
case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
{
rtx exec = force_reg (DImode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
DImode,
EXPAND_NORMAL));
rtx ptr = force_reg (DImode,
expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
V64DImode,
EXPAND_NORMAL));
rtx offsets = force_reg (V64SImode,
expand_expr (CALL_EXPR_ARG (exp, 2),
NULL_RTX, V64DImode,
EXPAND_NORMAL));
machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3)));
rtx val = force_reg (vmode,
expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
vmode,
EXPAND_NORMAL));
rtx addrs = gen_reg_rtx (V64DImode);
rtx tmp = gen_reg_rtx (V64SImode);
emit_insn (gen_ashlv64si3_exec (tmp, offsets,
GEN_INT (2),
gcn_gen_undef (V64SImode), exec));
emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
gcn_gen_undef (V64DImode),
exec));
rtx mem = gen_rtx_MEM (vmode, addrs);
/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
/* FIXME: set attributes. */
emit_insn (gen_mov_with_exec (mem, val, exec));
return target;
}
case GCN_BUILTIN_SQRTVF:
{
if (ignore)
return target;
rtx exec = gcn_full_exec_reg ();
rtx arg = force_reg (V64SFmode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
V64SFmode,
EXPAND_NORMAL));
emit_insn (gen_sqrtv64sf2_exec
(target, arg, gcn_gen_undef (V64SFmode), exec));
return target;
}
case GCN_BUILTIN_SQRTF:
{
if (ignore)
return target;
rtx arg = force_reg (SFmode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
SFmode,
EXPAND_NORMAL));
emit_insn (gen_sqrtsf2 (target, arg));
return target;
}
case GCN_BUILTIN_OMP_DIM_SIZE:
{
if (ignore)
return target;
emit_insn (gen_oacc_dim_size (target,
expand_expr (CALL_EXPR_ARG (exp, 0),
NULL_RTX, SImode,
EXPAND_NORMAL)));
return target;
}
case GCN_BUILTIN_OMP_DIM_POS:
{
if (ignore)
return target;
emit_insn (gen_oacc_dim_pos (target,
expand_expr (CALL_EXPR_ARG (exp, 0),
NULL_RTX, SImode,
EXPAND_NORMAL)));
return target;
}
case GCN_BUILTIN_CMP_SWAP:
case GCN_BUILTIN_CMP_SWAPLL:
return gcn_expand_cmp_swap (exp, target);
case GCN_BUILTIN_ACC_SINGLE_START:
{
if (ignore)
return target;
rtx wavefront = gcn_oacc_dim_pos (1);
rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
return cc;
}
case GCN_BUILTIN_ACC_SINGLE_COPY_START:
{
rtx blk = force_reg (SImode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
SImode, EXPAND_NORMAL));
rtx wavefront = gcn_oacc_dim_pos (1);
rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
rtx not_zero = gen_label_rtx ();
emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
emit_move_insn (blk, const0_rtx);
emit_label (not_zero);
return blk;
}
case GCN_BUILTIN_ACC_SINGLE_COPY_END:
return target;
case GCN_BUILTIN_ACC_BARRIER:
emit_insn (gen_gcn_wavefront_barrier ());
return target;
default:
gcc_unreachable ();
}
}
/* Expansion of simple arithmetic and bit binary operation builtins.
Intended for use with gcn_builtins table. */
static rtx
gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
machine_mode /*mode */ , int ignore,
struct gcn_builtin_description *d)
{
int icode = d->icode;
if (ignore)
return target;
rtx exec = force_reg (DImode,
expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
EXPAND_NORMAL));
machine_mode m1 = insn_data[icode].operand[1].mode;
rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
EXPAND_NORMAL);
if (!insn_data[icode].operand[1].predicate (arg1, m1))
arg1 = force_reg (m1, arg1);
machine_mode m2 = insn_data[icode].operand[2].mode;
rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
EXPAND_NORMAL);
if (!insn_data[icode].operand[2].predicate (arg2, m2))
arg2 = force_reg (m2, arg2);
rtx arg_prev;
if (call_expr_nargs (exp) == 4)
{
machine_mode m_prev = insn_data[icode].operand[4].mode;
arg_prev = force_reg (m_prev,
expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
m_prev, EXPAND_NORMAL));
}
else
arg_prev = gcn_gen_undef (GET_MODE (target));
rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
emit_insn (pat);
return target;
}
/* Implement TARGET_EXPAND_BUILTIN.
Expand an expression EXP that calls a built-in function, with result going
to TARGET if that's convenient (and in mode MODE if that's convenient).
SUBTARGET may be used as the target for computing one of EXP's operands.
IGNORE is nonzero if the value is to be ignored. */
rtx
gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
int ignore)
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
struct gcn_builtin_description *d;
gcc_assert (fcode < GCN_BUILTIN_MAX);
d = &gcn_builtins[fcode];
if (d->type == B_UNIMPLEMENTED)
sorry ("Builtin not implemented");
return d->expander (exp, target, subtarget, mode, ignore, d);
}
/* }}} */
/* {{{ Vectorization. */
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
A vector mask is a value that holds one boolean result for every element in
a vector. */
opt_machine_mode
gcn_vectorize_get_mask_mode (machine_mode)
{
/* GCN uses a DImode bit-mask. */
return DImode;
}
/* Return an RTX that references a vector with the i-th lane containing
PERM[i]*4.
Helper function for gcn_vectorize_vec_perm_const. */
static rtx
gcn_make_vec_perm_address (unsigned int *perm)
{
rtx x = gen_reg_rtx (V64SImode);
emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
/* Permutation addresses use byte addressing. With each vector lane being
4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
so only set those.
The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
select between lanes in two vectors, but as the DS_BPERMUTE* instructions
only take one source vector, the most-significant bit can be ignored
here. Instead, we can use EXEC masking to select the relevant part of
each source vector after they are permuted separately. */
uint64_t bit_mask = 1 << 2;
for (int i = 2; i < 8; i++, bit_mask <<= 1)
{
uint64_t exec_mask = 0;
uint64_t lane_mask = 1;
for (int j = 0; j < 64; j++, lane_mask <<= 1)
if ((perm[j] * 4) & bit_mask)
exec_mask |= lane_mask;
if (exec_mask)
emit_insn (gen_addv64si3_exec (x, x,
gcn_vec_constant (V64SImode,
bit_mask),
x, get_exec (exec_mask)));
}
return x;
}
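/* For example, if perm[0] == 3 then perm[0] * 4 == 12 (binary 1100), so lane
0 is included in the exec masks for the bit_mask == 4 and bit_mask == 8
iterations above, and its address lane accumulates 4 + 8 == 12. */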
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
Return true if permutation with SEL is possible.
If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
permutations. */
static bool
gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
rtx src0, rtx src1,
const vec_perm_indices & sel)
{
unsigned int nelt = GET_MODE_NUNITS (vmode);
gcc_assert (VECTOR_MODE_P (vmode));
gcc_assert (nelt <= 64);
gcc_assert (sel.length () == nelt);
if (!dst)
{
/* All vector permutations are possible on this architecture,
with varying degrees of efficiency depending on the permutation. */
return true;
}
unsigned int perm[64];
for (unsigned int i = 0; i < nelt; ++i)
perm[i] = sel[i] & (2 * nelt - 1);
for (unsigned int i = nelt; i < 64; ++i)
perm[i] = 0;
src0 = force_reg (vmode, src0);
src1 = force_reg (vmode, src1);
/* Make life a bit easier by swapping operands if necessary so that
the first element always comes from src0. */
if (perm[0] >= nelt)
{
std::swap (src0, src1);
for (unsigned int i = 0; i < nelt; ++i)
if (perm[i] < nelt)
perm[i] += nelt;
else
perm[i] -= nelt;
}
/* TODO: There are more efficient ways to implement certain permutations
using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
this more inefficient generic approach is used. */
int64_t src1_lanes = 0;
int64_t lane_bit = 1;
for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
{
/* Set the bits for lanes from src1. */
if (perm[i] >= nelt)
src1_lanes |= lane_bit;
}
rtx addr = gcn_make_vec_perm_address (perm);
rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
switch (vmode)
{
case E_V64QImode:
ds_bpermute = gen_ds_bpermutev64qi;
break;
case E_V64HImode:
ds_bpermute = gen_ds_bpermutev64hi;
break;
case E_V64SImode:
ds_bpermute = gen_ds_bpermutev64si;
break;
case E_V64HFmode:
ds_bpermute = gen_ds_bpermutev64hf;
break;
case E_V64SFmode:
ds_bpermute = gen_ds_bpermutev64sf;
break;
case E_V64DImode:
ds_bpermute = gen_ds_bpermutev64di;
break;
case E_V64DFmode:
ds_bpermute = gen_ds_bpermutev64df;
break;
default:
gcc_assert (false);
}
/* Load elements from src0 to dst. */
gcc_assert (~src1_lanes);
emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
/* Load elements from src1 to dst. */
if (src1_lanes)
{
/* Masking a lane masks both the destination and source lanes for
DS_BPERMUTE, so we need to have all lanes enabled for the permute,
then add an extra masked move to merge the results of permuting
the two source vectors together.
*/
rtx tmp = gen_reg_rtx (vmode);
emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
}
return true;
}
/* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
Return nonzero if vector MODE is supported with at least move
instructions. */
static bool
gcn_vector_mode_supported_p (machine_mode mode)
{
return (mode == V64QImode || mode == V64HImode
|| mode == V64SImode || mode == V64DImode
|| mode == V64SFmode || mode == V64DFmode);
}
/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
Enables autovectorization for all supported modes. */
static machine_mode
gcn_vectorize_preferred_simd_mode (scalar_mode mode)
{
switch (mode)
{
case E_QImode:
return V64QImode;
case E_HImode:
return V64HImode;
case E_SImode:
return V64SImode;
case E_DImode:
return V64DImode;
case E_SFmode:
return V64SFmode;
case E_DFmode:
return V64DFmode;
default:
return word_mode;
}
}
/* Implement TARGET_VECTORIZE_RELATED_MODE.
All GCN vectors are 64-lane, so this is simpler than other architectures.
In particular, we do *not* want to match vector bit-size. */
static opt_machine_mode
gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
scalar_mode element_mode, poly_uint64 nunits)
{
if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
return VOIDmode;
machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
if (!VECTOR_MODE_P (pref_mode))
return VOIDmode;
return pref_mode;
}
/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
Returns the preferred alignment in bits for accesses to vectors of type TYPE
in vectorized code. This might be less than or greater than the ABI-defined
value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
of a single element, in which case the vectorizer will not try to optimize
for alignment. */
static poly_uint64
gcn_preferred_vector_alignment (const_tree type)
{
return TYPE_ALIGN (TREE_TYPE (type));
}
/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
Return true if the target supports misaligned vector store/load of a
specific factor denoted in the misalignment parameter. */
static bool
gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
const_tree type, int misalignment,
bool is_packed)
{
if (is_packed)
return false;
/* If the misalignment is unknown, we should be able to handle the access
so long as it is not to a member of a packed data structure. */
if (misalignment == -1)
return true;
/* Return true if the misalignment is a multiple of the natural alignment
of the vector's element type. This is probably always going to be
true in practice, since we've already established that this isn't a
packed access. */
return misalignment % TYPE_ALIGN_UNIT (type) == 0;
}
/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
Return true if vector alignment is reachable (by peeling N iterations) for
the given scalar type TYPE. */
static bool
gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
{
/* Vectors which aren't in packed structures will not be less aligned than
the natural alignment of their element type, so this is safe. */
return !is_packed;
}
/* Generate DPP instructions used for vector reductions.
The opcode is given by INSN.
The first operand of the operation is shifted right by SHIFT vector lanes.
SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
broadcast to the next row (thereby acting like a shift of 16 for the end of
each row). If SHIFT is 32, lane 31 is broadcast to all the
following lanes (thereby acting like a shift of 32 for lane 63). */
char *
gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
int unspec, int shift)
{
static char buf[128];
const char *dpp;
const char *vcc_in = "";
const char *vcc_out = "";
/* Add the vcc operand if needed. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
vcc_in = ", vcc";
if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
|| unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
vcc_out = ", vcc";
}
/* Add the DPP modifiers. */
switch (shift)
{
case 1:
dpp = "row_shr:1 bound_ctrl:0";
break;
case 2:
dpp = "row_shr:2 bound_ctrl:0";
break;
case 4:
dpp = "row_shr:4 bank_mask:0xe";
break;
case 8:
dpp = "row_shr:8 bank_mask:0xc";
break;
case 16:
dpp = "row_bcast:15 row_mask:0xa";
break;
case 32:
dpp = "row_bcast:31 row_mask:0xc";
break;
default:
gcc_unreachable ();
}
if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
insn, dpp, insn, dpp);
else if (unspec == UNSPEC_MOV_DPP_SHR)
sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
else
sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
return buf;
}
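/* For illustration: with UNSPEC_PLUS_CARRY_DPP_SHR, an integer vector mode, a
shift of 4, and INSN being (say) "v_add_co_u32", the returned template is
roughly "v_add_co_u32\t%0, vcc, %1, %2 row_shr:4 bank_mask:0xe"; the
mnemonic itself is always supplied by the caller. */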
/* Generate vector reductions in terms of DPP instructions.
The vector register SRC of mode MODE is reduced using the operation given
by UNSPEC, and the scalar result is returned in lane 63 of a vector
register. */
rtx
gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
{
machine_mode orig_mode = mode;
bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
|| unspec == UNSPEC_SMAX_DPP_SHR
|| unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR)
&& (mode == V64DImode
|| mode == V64DFmode))
|| (unspec == UNSPEC_PLUS_DPP_SHR
&& mode == V64DFmode));
rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
: unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
: unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
: unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
: unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
: UNKNOWN);
bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
|| unspec == UNSPEC_SMAX_DPP_SHR
|| unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR)
&& (mode == V64QImode
|| mode == V64HImode));
bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR);
bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
&& (TARGET_GCN3 || mode == V64DImode);
if (use_plus_carry)
unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
if (use_extends)
{
rtx tmp = gen_reg_rtx (V64SImode);
convert_move (tmp, src, unsignedp);
src = tmp;
mode = V64SImode;
}
/* Perform reduction by first performing the reduction operation on every
pair of lanes, then on every pair of results from the previous
iteration (thereby effectively reducing every 4 lanes) and so on until
all lanes are reduced. */
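/* Illustration, not part of the original source: the loop below runs six
   iterations with SHIFT = 1, 2, 4, 8, 16, 32.  After the first iteration
   each lane holds the result for two adjacent lanes, after the second for
   four, and so on, so after the sixth iteration lane 63 holds the
   reduction of all 64 lanes.  */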
rtx in, out = force_reg (mode, src);
for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
{
rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
in = out;
out = gen_reg_rtx (mode);
if (use_moves)
{
rtx tmp = gen_reg_rtx (mode);
emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
}
else
{
rtx insn = gen_rtx_SET (out,
gen_rtx_UNSPEC (mode,
gen_rtvec (3, in, in,
shift_val),
unspec));
/* Add clobber for instructions that set the carry flags. */
if (use_plus_carry)
{
rtx clobber = gen_rtx_CLOBBER (VOIDmode,
gen_rtx_REG (DImode, VCC_REG));
insn = gen_rtx_PARALLEL (VOIDmode,
gen_rtvec (2, insn, clobber));
}
emit_insn (insn);
}
}
if (use_extends)
{
rtx tmp = gen_reg_rtx (orig_mode);
convert_move (tmp, out, unsignedp);
out = tmp;
}
return out;
}
/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
int
gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
{
/* Always vectorize. */
return 1;
}
/* }}} */
/* {{{ md_reorg pass. */
/* Identify VMEM instructions from their "type" attribute. */
static bool
gcn_vmem_insn_p (attr_type type)
{
switch (type)
{
case TYPE_MUBUF:
case TYPE_MTBUF:
case TYPE_FLAT:
return true;
case TYPE_UNKNOWN:
case TYPE_SOP1:
case TYPE_SOP2:
case TYPE_SOPK:
case TYPE_SOPC:
case TYPE_SOPP:
case TYPE_SMEM:
case TYPE_DS:
case TYPE_VOP2:
case TYPE_VOP1:
case TYPE_VOPC:
case TYPE_VOP3A:
case TYPE_VOP3B:
case TYPE_VOP_SDWA:
case TYPE_VOP_DPP:
case TYPE_MULT:
case TYPE_VMULT:
return false;
}
gcc_unreachable ();
return false;
}
/* If INSN sets the EXEC register to a constant value, return the value,
otherwise return zero. */
static int64_t
gcn_insn_exec_value (rtx_insn *insn)
{
if (!NONDEBUG_INSN_P (insn))
return 0;
rtx pattern = PATTERN (insn);
if (GET_CODE (pattern) == SET)
{
rtx dest = XEXP (pattern, 0);
rtx src = XEXP (pattern, 1);
if (GET_MODE (dest) == DImode
&& REG_P (dest) && REGNO (dest) == EXEC_REG
&& CONST_INT_P (src))
return INTVAL (src);
}
return 0;
}
/* Sets the EXEC register before INSN to the value that it had after
LAST_EXEC_DEF. The constant value of the EXEC register is returned if
known, otherwise it returns zero. */
static int64_t
gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
bool curr_exec_known, bool &last_exec_def_saved)
{
rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
rtx exec;
int64_t exec_value = gcn_insn_exec_value (last_exec_def);
if (exec_value)
{
/* If the EXEC value is a constant and it happens to be the same as the
current EXEC value, the restore can be skipped. */
if (curr_exec_known && exec_value == curr_exec)
return exec_value;
exec = GEN_INT (exec_value);
}
else
{
/* If the EXEC value is not a constant, save it in a register after the
point of definition. */
rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
if (!last_exec_def_saved)
{
start_sequence ();
emit_move_insn (exec_save_reg, exec_reg);
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn_after (seq, last_exec_def);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Saving EXEC after insn %d.\n",
INSN_UID (last_exec_def));
last_exec_def_saved = true;
}
exec = exec_save_reg;
}
/* Restore EXEC register before the usage. */
start_sequence ();
emit_move_insn (exec_reg, exec);
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn_before (seq, insn);
if (dump_file && (dump_flags & TDF_DETAILS))
{
if (exec_value)
fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
exec_value, INSN_UID (insn));
else
fprintf (dump_file,
"Restoring EXEC from saved value before insn %d.\n",
INSN_UID (insn));
}
return exec_value;
}
/* Implement TARGET_MACHINE_DEPENDENT_REORG.
Ensure that pipeline dependencies and lane masking are set correctly. */
static void
gcn_md_reorg (void)
{
basic_block bb;
rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
regset_head live;
INIT_REG_SET (&live);
compute_bb_for_insn ();
if (!optimize)
{
split_all_insns ();
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "After split:\n");
print_rtl_with_bb (dump_file, get_insns (), dump_flags);
}
/* Update data-flow information for split instructions. */
df_insn_rescan_all ();
}
df_live_add_problem ();
df_live_set_all_dirty ();
df_analyze ();
/* This pass ensures that the EXEC register is set correctly, according
to the "exec" attribute. However, care must be taken so that the
value that reaches explicit uses of the EXEC register remains the
same as before.
*/
FOR_EACH_BB_FN (bb, cfun)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "BB %d:\n", bb->index);
rtx_insn *insn, *curr;
rtx_insn *last_exec_def = BB_HEAD (bb);
bool last_exec_def_saved = false;
bool curr_exec_explicit = true;
bool curr_exec_known = true;
int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
after last_exec_def is executed'. */
bitmap live_in = DF_LR_IN (bb);
bool exec_live_on_entry = false;
if (bitmap_bit_p (live_in, EXEC_LO_REG)
|| bitmap_bit_p (live_in, EXEC_HI_REG))
{
if (dump_file)
fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
(int) bb->index);
exec_live_on_entry = true;
}
FOR_BB_INSNS_SAFE (bb, insn, curr)
{
if (!NONDEBUG_INSN_P (insn))
continue;
if (GET_CODE (PATTERN (insn)) == USE
|| GET_CODE (PATTERN (insn)) == CLOBBER)
continue;
HARD_REG_SET defs, uses;
CLEAR_HARD_REG_SET (defs);
CLEAR_HARD_REG_SET (uses);
note_stores (insn, record_hard_reg_sets, &defs);
note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
bool exec_used = (hard_reg_set_intersect_p
(uses, reg_class_contents[(int) EXEC_MASK_REG])
|| TEST_HARD_REG_BIT (uses, EXECZ_REG));
/* Check the instruction for implicit setting of EXEC via an
attribute. */
attr_exec exec_attr = get_attr_exec (insn);
int64_t new_exec;
switch (exec_attr)
{
case EXEC_NONE:
new_exec = 0;
break;
case EXEC_SINGLE:
/* Instructions that do not involve memory accesses only require
bit 0 of EXEC to be set. */
if (gcn_vmem_insn_p (get_attr_type (insn))
|| get_attr_type (insn) == TYPE_DS)
new_exec = 1;
else
new_exec = curr_exec | 1;
break;
case EXEC_FULL:
new_exec = -1;
break;
default: /* Auto-detect what setting is appropriate. */
{
new_exec = 0;
/* If EXEC is referenced explicitly then we don't need to do
anything to set it, so we're done. */
if (exec_used)
break;
/* Scan the insn for VGPRs defs or uses. The mode determines
what kind of exec is needed. */
subrtx_iterator::array_type array;
FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
{
const_rtx x = *iter;
if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
{
if (VECTOR_MODE_P (GET_MODE (x)))
{
new_exec = -1;
break;
}
else
new_exec = 1;
}
}
}
break;
}
if (new_exec && (!curr_exec_known || new_exec != curr_exec))
{
start_sequence ();
emit_move_insn (exec_reg, GEN_INT (new_exec));
rtx_insn *seq = get_insns ();
end_sequence ();
emit_insn_before (seq, insn);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
new_exec, INSN_UID (insn));
curr_exec = new_exec;
curr_exec_explicit = false;
curr_exec_known = true;
}
else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Exec already is %ld before insn %d.\n",
new_exec, INSN_UID (insn));
}
/* The state of the EXEC register is unknown after a
function call. */
if (CALL_P (insn))
curr_exec_known = false;
/* Handle explicit uses of EXEC. If the instruction is a partial
explicit definition of EXEC, then treat it as an explicit use of
EXEC as well. */
if (exec_used || exec_lo_def_p != exec_hi_def_p)
{
/* An instruction that explicitly uses EXEC should not also
implicitly define it. */
gcc_assert (!exec_used || !new_exec);
if (!curr_exec_known || !curr_exec_explicit)
{
/* Restore the previous explicitly defined value. */
curr_exec = gcn_restore_exec (insn, last_exec_def,
curr_exec, curr_exec_known,
last_exec_def_saved);
curr_exec_explicit = true;
curr_exec_known = true;
}
}
/* Handle explicit definitions of EXEC. */
if (exec_lo_def_p || exec_hi_def_p)
{
last_exec_def = insn;
last_exec_def_saved = false;
curr_exec = gcn_insn_exec_value (insn);
curr_exec_explicit = true;
curr_exec_known = true;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"Found %s definition of EXEC at insn %d.\n",
exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
INSN_UID (insn));
}
exec_live_on_entry = false;
}
COPY_REG_SET (&live, DF_LR_OUT (bb));
df_simulate_initialize_backwards (bb, &live);
/* If EXEC is live after the basic block, restore the value of EXEC
at the end of the block. */
if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
|| REGNO_REG_SET_P (&live, EXEC_HI_REG))
&& (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
{
rtx_insn *end_insn = BB_END (bb);
/* If the instruction is not a jump instruction, do the restore
after the last instruction in the basic block. */
if (NONJUMP_INSN_P (end_insn))
end_insn = NEXT_INSN (end_insn);
gcn_restore_exec (end_insn, last_exec_def, curr_exec,
curr_exec_known, last_exec_def_saved);
}
}
CLEAR_REG_SET (&live);
/* "Manually Inserted Wait States (NOPs)."
GCN hardware detects most kinds of register dependencies, but there
are some exceptions documented in the ISA manual. This pass
detects the missed cases, and inserts the documented number of NOPs
required for correct execution. */
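/* Illustration, not part of the original source: for example, a VALU
   instruction that writes an SGPR followed by a FLAT load reading that
   same SGPR needs 5 wait states, so up to five s_nop instructions are
   inserted between them (fewer if independent instructions already
   separate the pair).  */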
const int max_waits = 5;
struct ilist
{
rtx_insn *insn;
attr_unit unit;
attr_delayeduse delayeduse;
HARD_REG_SET writes;
HARD_REG_SET reads;
int age;
} back[max_waits];
int oldest = 0;
for (int i = 0; i < max_waits; i++)
back[i].insn = NULL;
rtx_insn *insn, *last_insn = NULL;
for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
{
if (!NONDEBUG_INSN_P (insn))
continue;
if (GET_CODE (PATTERN (insn)) == USE
|| GET_CODE (PATTERN (insn)) == CLOBBER)
continue;
attr_type itype = get_attr_type (insn);
attr_unit iunit = get_attr_unit (insn);
attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
HARD_REG_SET ireads, iwrites;
CLEAR_HARD_REG_SET (ireads);
CLEAR_HARD_REG_SET (iwrites);
note_stores (insn, record_hard_reg_sets, &iwrites);
note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
/* Scan recent previous instructions for dependencies not handled in
hardware. */
int nops_rqd = 0;
for (int i = oldest; i < oldest + max_waits; i++)
{
struct ilist *prev_insn = &back[i % max_waits];
if (!prev_insn->insn)
continue;
/* VALU writes SGPR followed by VMEM reading the same SGPR
requires 5 wait states. */
if ((prev_insn->age + nops_rqd) < 5
&& prev_insn->unit == UNIT_VECTOR
&& gcn_vmem_insn_p (itype))
{
HARD_REG_SET regs = prev_insn->writes & ireads;
if (hard_reg_set_intersect_p
(regs, reg_class_contents[(int) SGPR_REGS]))
nops_rqd = 5 - prev_insn->age;
}
/* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
requires 5 wait states. */
if ((prev_insn->age + nops_rqd) < 5
&& prev_insn->unit == UNIT_VECTOR
&& iunit == UNIT_VECTOR
&& ((hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int) EXEC_MASK_REG])
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
||
(hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int) VCC_CONDITIONAL_REG])
&& TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
nops_rqd = 5 - prev_insn->age;
/* VALU writes SGPR/VCC followed by v_{read,write}lane using
SGPR/VCC as lane select requires 4 wait states. */
if ((prev_insn->age + nops_rqd) < 4
&& prev_insn->unit == UNIT_VECTOR
&& get_attr_laneselect (insn) == LANESELECT_YES)
{
HARD_REG_SET regs = prev_insn->writes & ireads;
if (hard_reg_set_intersect_p
(regs, reg_class_contents[(int) SGPR_REGS])
|| hard_reg_set_intersect_p
(regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
nops_rqd = 4 - prev_insn->age;
}
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
requires 2 wait states. */
if ((prev_insn->age + nops_rqd) < 2
&& prev_insn->unit == UNIT_VECTOR
&& itype == TYPE_VOP_DPP)
{
HARD_REG_SET regs = prev_insn->writes & ireads;
if (hard_reg_set_intersect_p
(regs, reg_class_contents[(int) VGPR_REGS]))
nops_rqd = 2 - prev_insn->age;
}
/* A store whose input registers must not be overwritten by the
following instruction ("delayeduse") requires 1 wait state. */
if ((prev_insn->age + nops_rqd) < 1
&& prev_insn->delayeduse == DELAYEDUSE_YES
&& ((hard_reg_set_intersect_p
(prev_insn->reads, iwrites))))
nops_rqd = 1 - prev_insn->age;
}
/* Insert the required number of NOPs. */
for (int i = nops_rqd; i > 0; i--)
emit_insn_after (gen_nop (), last_insn);
/* Age the previous instructions. We can also ignore writes to
registers subsequently overwritten. */
HARD_REG_SET written;
CLEAR_HARD_REG_SET (written);
for (int i = oldest + max_waits - 1; i > oldest; i--)
{
struct ilist *prev_insn = &back[i % max_waits];
/* Assume all instructions are equivalent to one "wait", the same
as s_nop. This is probably true for SALU, but not VALU (which
may take longer), so this is not optimal. However, AMD do
not publish the cycle times for instructions. */
prev_insn->age += 1 + nops_rqd;
written |= iwrites;
prev_insn->writes &= ~written;
}
/* Track the current instruction as a previous instruction. */
back[oldest].insn = insn;
back[oldest].unit = iunit;
back[oldest].delayeduse = idelayeduse;
back[oldest].writes = iwrites;
back[oldest].reads = ireads;
back[oldest].age = 0;
oldest = (oldest + 1) % max_waits;
last_insn = insn;
}
}
/* }}} */
/* {{{ OpenACC / OpenMP. */
#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
/* Implement TARGET_GOACC_VALIDATE_DIMS.
Check the launch dimensions provided for an OpenACC compute
region, or routine. */
static bool
gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
unsigned /*used*/)
{
bool changed = false;
const int max_workers = 16;
/* To the user, the vector size must appear to be 64, unless this is a
SEQ routine. The real, internal value is always 1, which means use
autovectorization, but the user should not see that. */
if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
&& dims[GOMP_DIM_VECTOR] >= 0)
{
if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
&& dims[GOMP_DIM_VECTOR] != 64)
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
OPT_Wopenacc_dims,
(dims[GOMP_DIM_VECTOR]
? G_("using %, ignoring %d")
: G_("using %, "
"ignoring runtime setting")),
dims[GOMP_DIM_VECTOR]);
dims[GOMP_DIM_VECTOR] = 1;
changed = true;
}
/* Check the num workers is not too large. */
if (dims[GOMP_DIM_WORKER] > max_workers)
{
warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
OPT_Wopenacc_dims,
"using %, ignoring %d",
max_workers, dims[GOMP_DIM_WORKER]);
dims[GOMP_DIM_WORKER] = max_workers;
changed = true;
}
/* Set global defaults. */
if (!decl)
{
dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
if (dims[GOMP_DIM_WORKER] < 0)
dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
if (dims[GOMP_DIM_GANG] < 0)
dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
changed = true;
}
return changed;
}
/* Helper function for oacc_dim_size instruction.
Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
rtx
gcn_oacc_dim_size (int dim)
{
if (dim < 0 || dim > 2)
error ("offload dimension out of range (%d)", dim);
/* Vectors are a special case. */
if (dim == 2)
return const1_rtx; /* Think of this as 1 times 64. */
static int offset[] = {
/* Offsets into dispatch packet. */
12, /* X dim = Gang / Team / Work-group. */
20, /* Z dim = Worker / Thread / Wavefront. */
16 /* Y dim = Vector / SIMD / Work-item. */
};
rtx addr = gen_rtx_PLUS (DImode,
gen_rtx_REG (DImode,
cfun->machine->args.
reg[DISPATCH_PTR_ARG]),
GEN_INT (offset[dim]));
return gen_rtx_MEM (SImode, addr);
}
/* Helper function for oacc_dim_pos instruction.
Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
rtx
gcn_oacc_dim_pos (int dim)
{
if (dim < 0 || dim > 2)
error ("offload dimension out of range (%d)", dim);
static const int reg[] = {
WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
};
int reg_num = cfun->machine->args.reg[reg[dim]];
/* The information must have been requested by the kernel. */
gcc_assert (reg_num >= 0);
return gen_rtx_REG (SImode, reg_num);
}
/* Implement TARGET_GOACC_FORK_JOIN. */
static bool
gcn_fork_join (gcall *call, const int dims[], bool is_fork)
{
tree arg = gimple_call_arg (call, 2);
unsigned axis = TREE_INT_CST_LOW (arg);
if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
return true;
return false;
}
/* Implement ???????
FIXME make this a real hook.
Adjust FNDECL such that options inherited from the host compiler
are made appropriate for the accelerator compiler. */
void
gcn_fixup_accel_lto_options (tree fndecl)
{
tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
if (!func_optimize)
return;
tree old_optimize
= build_optimization_node (&global_options, &global_options_set);
tree new_optimize;
/* If the function changed the optimization levels as well as
setting target options, start with the optimizations
specified. */
if (func_optimize != old_optimize)
cl_optimization_restore (&global_options, &global_options_set,
TREE_OPTIMIZATION (func_optimize));
gcn_option_override ();
/* The target attributes may also change some optimization flags,
so update the optimization options if necessary. */
new_optimize = build_optimization_node (&global_options,
&global_options_set);
if (old_optimize != new_optimize)
{
DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
cl_optimization_restore (&global_options, &global_options_set,
TREE_OPTIMIZATION (old_optimize));
}
}
/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
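/* Illustration, not part of the original source: with, say,
   -mgang-private-size=512 and 64 bytes of worker-reduction storage, LDS
   bytes [0, 512) hold gang-private variables, [512, 576) hold the
   reduction buffers, and the range [*lo, *hi) reported below, here
   [576, acc_lds_size), is left for worker broadcasts.  */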
static void
gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
unsigned HOST_WIDE_INT *hi,
int ARG_UNUSED (dims[GOMP_DIM_MAX]),
unsigned HOST_WIDE_INT
ARG_UNUSED (private_size[GOMP_DIM_MAX]),
unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
{
*lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
/* !!! We can maybe use dims[] to estimate the maximum number of work
groups/wavefronts/etc. we will launch, and therefore tune the maximum
amount of LDS we should use. For now, use a minimal amount to try to
maximise occupancy. */
*hi = acc_lds_size;
machine_function *machfun = cfun->machine;
machfun->reduction_base = gang_private_size_opt;
machfun->reduction_limit
= gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
}
/* }}} */
/* {{{ ASM Output. */
/* Implement TARGET_ASM_FILE_START.
Print assembler file header text. */
static void
output_file_start (void)
{
const char *cpu;
bool use_xnack_attr = true;
bool use_sram_attr = true;
switch (gcn_arch)
{
case PROCESSOR_FIJI:
cpu = "gfx803";
#ifndef HAVE_GCN_XNACK_FIJI
use_xnack_attr = false;
#endif
use_sram_attr = false;
break;
case PROCESSOR_VEGA10:
cpu = "gfx900";
#ifndef HAVE_GCN_XNACK_GFX900
use_xnack_attr = false;
#endif
use_sram_attr = false;
break;
case PROCESSOR_VEGA20:
cpu = "gfx906";
#ifndef HAVE_GCN_XNACK_GFX906
use_xnack_attr = false;
#endif
use_sram_attr = false;
break;
case PROCESSOR_GFX908:
cpu = "gfx908";
#ifndef HAVE_GCN_XNACK_GFX908
use_xnack_attr = false;
#endif
#ifndef HAVE_GCN_SRAM_ECC_GFX908
use_sram_attr = false;
#endif
break;
default: gcc_unreachable ();
}
#if HAVE_GCN_ASM_V3_SYNTAX
const char *xnack = (flag_xnack ? "+xnack" : "");
const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
#endif
#if HAVE_GCN_ASM_V4_SYNTAX
/* In HSACOv4 no attribute setting means the binary supports "any" hardware
configuration. In GCC binaries, this is true for SRAM ECC, but not
XNACK. */
const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
: flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
: "");
#endif
if (!use_xnack_attr)
xnack = "";
if (!use_sram_attr)
sram_ecc = "";
fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
cpu,
#if HAVE_GCN_ASM_V3_SYNTAX
xnack, sram_ecc
#endif
#ifdef HAVE_GCN_ASM_V4_SYNTAX
sram_ecc, xnack
#endif
);
}
/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
Print the initial definition of a function name.
For GCN kernel entry points this includes all the HSA meta-data, special
alignment constraints that don't apply to regular functions, and magic
comments that pass information to mkoffload. */
void
gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
{
int sgpr, vgpr;
bool xnack_enabled = false;
fputs ("\n\n", file);
if (cfun && cfun->machine && cfun->machine->normal_function)
{
fputs ("\t.type\t", file);
assemble_name (file, name);
fputs (",@function\n", file);
assemble_name (file, name);
fputs (":\n", file);
return;
}
/* Determine count of sgpr/vgpr registers by looking for last
one used. */
for (sgpr = 101; sgpr >= 0; sgpr--)
if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
break;
sgpr++;
for (vgpr = 255; vgpr >= 0; vgpr--)
if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
break;
vgpr++;
if (!leaf_function_p ())
{
/* We can't know how many registers function calls might use. */
if (vgpr < MAX_NORMAL_VGPR_COUNT)
vgpr = MAX_NORMAL_VGPR_COUNT;
if (sgpr < MAX_NORMAL_SGPR_COUNT)
sgpr = MAX_NORMAL_SGPR_COUNT;
}
fputs ("\t.rodata\n"
"\t.p2align\t6\n"
"\t.amdhsa_kernel\t", file);
assemble_name (file, name);
fputs ("\n", file);
int reg = FIRST_SGPR_REG;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
int reg_first = -1;
int reg_last;
if ((cfun->machine->args.requested & (1 << a))
&& (gcn_kernel_arg_types[a].fixed_regno < 0))
{
reg_first = reg;
reg_last = (reg_first
+ (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
/ UNITS_PER_WORD) - 1);
reg = reg_last + 1;
}
if (gcn_kernel_arg_types[a].header_pseudo)
{
fprintf (file, "\t %s%s\t%i",
(cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
gcn_kernel_arg_types[a].header_pseudo,
(cfun->machine->args.requested & (1 << a)) != 0);
if (reg_first != -1)
{
fprintf (file, " ; (");
for (int i = reg_first; i <= reg_last; ++i)
{
if (i != reg_first)
fprintf (file, ", ");
fprintf (file, "%s", reg_names[i]);
}
fprintf (file, ")");
}
fprintf (file, "\n");
}
else if (gcn_kernel_arg_types[a].fixed_regno >= 0
&& cfun->machine->args.requested & (1 << a))
fprintf (file, "\t ; %s\t%i (%s)\n",
gcn_kernel_arg_types[a].name,
(cfun->machine->args.requested & (1 << a)) != 0,
reg_names[gcn_kernel_arg_types[a].fixed_regno]);
}
fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
(cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
? 2
: cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
? 1 : 0);
fprintf (file,
"\t .amdhsa_next_free_vgpr\t%i\n"
"\t .amdhsa_next_free_sgpr\t%i\n"
"\t .amdhsa_reserve_vcc\t1\n"
"\t .amdhsa_reserve_flat_scratch\t0\n"
"\t .amdhsa_reserve_xnack_mask\t%i\n"
"\t .amdhsa_private_segment_fixed_size\t%i\n"
"\t .amdhsa_group_segment_fixed_size\t%u\n"
"\t .amdhsa_float_denorm_mode_32\t3\n"
"\t .amdhsa_float_denorm_mode_16_64\t3\n",
vgpr,
sgpr,
xnack_enabled,
/* workitem_private_segment_bytes_size needs to be
one 64th the wave-front stack size. */
stack_size_opt / 64,
LDS_SIZE);
fputs ("\t.end_amdhsa_kernel\n", file);
#if 1
/* The following is YAML embedded in assembler; tabs are not allowed. */
fputs (" .amdgpu_metadata\n"
" amdhsa.version:\n"
" - 1\n"
" - 0\n"
" amdhsa.kernels:\n"
" - .name: ", file);
assemble_name (file, name);
fputs ("\n .symbol: ", file);
assemble_name (file, name);
fprintf (file,
".kd\n"
" .kernarg_segment_size: %i\n"
" .kernarg_segment_align: %i\n"
" .group_segment_fixed_size: %u\n"
" .private_segment_fixed_size: %i\n"
" .wavefront_size: 64\n"
" .sgpr_count: %i\n"
" .vgpr_count: %i\n"
" .max_flat_workgroup_size: 1024\n",
cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
LDS_SIZE,
stack_size_opt / 64,
sgpr, vgpr);
fputs (" .end_amdgpu_metadata\n", file);
#endif
fputs ("\t.text\n", file);
fputs ("\t.align\t256\n", file);
fputs ("\t.type\t", file);
assemble_name (file, name);
fputs (",@function\n", file);
assemble_name (file, name);
fputs (":\n", file);
/* This comment is read by mkoffload. */
if (flag_openacc)
fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
}
/* Implement TARGET_ASM_SELECT_SECTION.
Return the section into which EXP should be placed. */
static section *
gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
{
if (TREE_TYPE (exp) != error_mark_node
&& TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
{
if (!DECL_P (exp))
return get_section (".lds_bss",
SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
NULL);
return get_named_section (exp, ".lds_bss", reloc);
}
return default_elf_select_section (exp, reloc, align);
}
/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
Emits custom text into the assembler file at the head of each function. */
static void
gcn_target_asm_function_prologue (FILE *file)
{
machine_function *offsets = gcn_compute_frame_offsets ();
asm_fprintf (file, "\t; using %s addressing in function\n",
offsets->use_flat_addressing ? "flat" : "global");
if (offsets->normal_function)
{
asm_fprintf (file, "\t; frame pointer needed: %s\n",
offsets->need_frame_pointer ? "true" : "false");
asm_fprintf (file, "\t; lr needs saving: %s\n",
offsets->lr_needs_saving ? "true" : "false");
asm_fprintf (file, "\t; outgoing args size: %wd\n",
offsets->outgoing_args_size);
asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
asm_fprintf (file, "\t; callee save size: %wd\n",
offsets->callee_saves);
}
else
{
asm_fprintf (file, "\t; HSA kernel entry point\n");
asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
asm_fprintf (file, "\t; outgoing args size: %wd\n",
offsets->outgoing_args_size);
}
}
/* Helper function for print_operand and print_operand_address.
Print a register as the assembler requires, according to mode and name. */
static void
print_reg (FILE *file, rtx x)
{
machine_mode mode = GET_MODE (x);
if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
|| mode == HFmode || mode == SFmode
|| mode == V64SFmode || mode == V64SImode
|| mode == V64QImode || mode == V64HImode)
fprintf (file, "%s", reg_names[REGNO (x)]);
else if (mode == DImode || mode == V64DImode
|| mode == DFmode || mode == V64DFmode)
{
if (SGPR_REGNO_P (REGNO (x)))
fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
REGNO (x) - FIRST_SGPR_REG + 1);
else if (VGPR_REGNO_P (REGNO (x)))
fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
REGNO (x) - FIRST_VGPR_REG + 1);
else if (REGNO (x) == FLAT_SCRATCH_REG)
fprintf (file, "flat_scratch");
else if (REGNO (x) == EXEC_REG)
fprintf (file, "exec");
else if (REGNO (x) == VCC_LO_REG)
fprintf (file, "vcc");
else
fprintf (file, "[%s:%s]",
reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
}
else if (mode == TImode)
{
if (SGPR_REGNO_P (REGNO (x)))
fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
REGNO (x) - FIRST_SGPR_REG + 3);
else if (VGPR_REGNO_P (REGNO (x)))
fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
REGNO (x) - FIRST_VGPR_REG + 3);
else
gcc_unreachable ();
}
else
gcc_unreachable ();
}
/* Implement TARGET_SECTION_TYPE_FLAGS.
Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
static unsigned int
gcn_section_type_flags (tree decl, const char *name, int reloc)
{
if (strcmp (name, ".lds_bss") == 0)
return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
return default_section_type_flags (decl, name, reloc);
}
/* Helper function for gcn_asm_output_symbol_ref.
FIXME: This function is used to lay out gang-private variables in LDS
on a per-CU basis.
There may be cases in which gang-private variables in different compilation
units could clobber each other. In that case we should be relying on the
linker to lay out gang-private LDS space, but that doesn't appear to be
possible at present. */
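/* Illustration, not part of the original source: if the high-water mark is
   currently 32, a 4-byte variable is given offset 32 and the mark moves to
   36; a following 8-byte variable is aligned up to offset 40 and the mark
   moves to 48.  Exceeding -mgang-private-size is diagnosed below.  */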
static void
gcn_print_lds_decl (FILE *f, tree var)
{
int *offset;
if ((offset = lds_allocs.get (var)))
fprintf (f, "%u", (unsigned) *offset);
else
{
unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
tree type = TREE_TYPE (var);
unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
if (size > align && size > 4 && align < 8)
align = 8;
gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
lds_allocs.put (var, gang_private_hwm);
fprintf (f, "%u", gang_private_hwm);
gang_private_hwm += size;
if (gang_private_hwm > gang_private_size_opt)
error ("%d bytes of gang-private data-share memory exhausted"
" (increase with %<-mgang-private-size=%d%>, for example)",
gang_private_size_opt, gang_private_hwm);
}
}
/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
void
gcn_asm_output_symbol_ref (FILE *file, rtx x)
{
tree decl;
if (cfun
&& (decl = SYMBOL_REF_DECL (x)) != 0
&& TREE_CODE (decl) == VAR_DECL
&& AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
{
/* LDS symbols (emitted using this hook) are only used at present
to propagate worker values from an active thread to neutered
threads. Use the same offset for each such block, but don't
use zero because null pointers are used to identify the active
thread in GOACC_single_copy_start calls. */
gcn_print_lds_decl (file, decl);
}
else
{
assemble_name (file, XSTR (x, 0));
/* FIXME: See above -- this condition is unreachable. */
if (cfun
&& (decl = SYMBOL_REF_DECL (x)) != 0
&& TREE_CODE (decl) == VAR_DECL
&& AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
fputs ("@abs32", file);
}
}
/* Implement TARGET_CONSTANT_ALIGNMENT.
Returns the alignment in bits of a constant that is being placed in memory.
CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
would ordinarily have. */
static HOST_WIDE_INT
gcn_constant_alignment (const_tree ARG_UNUSED (constant),
HOST_WIDE_INT basic_align)
{
return basic_align > 128 ? basic_align : 128;
}
/* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
void
print_operand_address (FILE *file, rtx mem)
{
gcc_assert (MEM_P (mem));
rtx reg;
rtx offset;
addr_space_t as = MEM_ADDR_SPACE (mem);
rtx addr = XEXP (mem, 0);
gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
if (AS_SCRATCH_P (as))
switch (GET_CODE (addr))
{
case REG:
print_reg (file, addr);
break;
case PLUS:
reg = XEXP (addr, 0);
offset = XEXP (addr, 1);
print_reg (file, reg);
if (GET_CODE (offset) == CONST_INT)
fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
else
abort ();
break;
default:
debug_rtx (addr);
abort ();
}
else if (AS_ANY_FLAT_P (as))
{
if (GET_CODE (addr) == REG)
print_reg (file, addr);
else
{
gcc_assert (TARGET_GCN5_PLUS);
print_reg (file, XEXP (addr, 0));
}
}
else if (AS_GLOBAL_P (as))
{
gcc_assert (TARGET_GCN5_PLUS);
rtx base = addr;
rtx vgpr_offset = NULL_RTX;
if (GET_CODE (addr) == PLUS)
{
base = XEXP (addr, 0);
if (GET_CODE (base) == PLUS)
{
/* (SGPR + VGPR) + CONST */
vgpr_offset = XEXP (base, 1);
base = XEXP (base, 0);
}
else
{
rtx offset = XEXP (addr, 1);
if (REG_P (offset))
/* SGPR + VGPR */
vgpr_offset = offset;
else if (CONST_INT_P (offset))
/* VGPR + CONST or SGPR + CONST */
;
else
output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
}
}
if (REG_P (base))
{
if (VGPR_REGNO_P (REGNO (base)))
print_reg (file, base);
else if (SGPR_REGNO_P (REGNO (base)))
{
/* The assembler requires a 64-bit VGPR pair here, even though
the offset should be only 32-bit. */
if (vgpr_offset == NULL_RTX)
/* In this case, the vector offset is zero, so we use the first
lane of v1, which is initialized to zero. */
{
if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
fprintf (file, "v1");
else
fprintf (file, "v[1:2]");
}
else if (REG_P (vgpr_offset)
&& VGPR_REGNO_P (REGNO (vgpr_offset)))
{
if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
fprintf (file, "v%d",
REGNO (vgpr_offset) - FIRST_VGPR_REG);
else
fprintf (file, "v[%d:%d]",
REGNO (vgpr_offset) - FIRST_VGPR_REG,
REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
}
else
output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
}
}
else
output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
}
else if (AS_ANY_DS_P (as))
switch (GET_CODE (addr))
{
case REG:
print_reg (file, addr);
break;
case PLUS:
reg = XEXP (addr, 0);
print_reg (file, reg);
break;
default:
debug_rtx (addr);
abort ();
}
else
switch (GET_CODE (addr))
{
case REG:
print_reg (file, addr);
fprintf (file, ", 0");
break;
case PLUS:
reg = XEXP (addr, 0);
offset = XEXP (addr, 1);
print_reg (file, reg);
fprintf (file, ", ");
if (GET_CODE (offset) == REG)
print_reg (file, offset);
else if (GET_CODE (offset) == CONST_INT)
fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
else
abort ();
break;
default:
debug_rtx (addr);
abort ();
}
}
/* Implement PRINT_OPERAND via gcn.h.
b - print operand size as untyped operand (b8/b16/b32/b64)
B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
i - print operand size as untyped operand (i16/b32/i64)
I - print operand size as SI/DI untyped operand(i32/b32/i64)
u - print operand size as untyped operand (u16/u32/u64)
U - print operand size as SI/DI untyped operand(u32/u64)
o - print operand size as memory access size for loads
(ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
s - print operand size as memory access size for stores
(byte/short/dword/dwordx2/dwordx3/dwordx4)
C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
c - print inverse conditional code for s_cbranch
D - print conditional code for s_cmp (eq_u64/lg_u64...)
E - print conditional code for v_cmp (eq_u64/ne_u64...)
A - print address in formatting suitable for given address space.
O - print offset:n for data share operations.
^ - print "_co" suffix for GCN5 mnemonics
g - print "glc", if appropriate for given MEM
*/
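/* Illustration, not part of the original source: for a QImode memory
   operand, %o prints "_ubyte" while %s prints "_byte"; for DImode both
   print "_dwordx2".  So a template written as something like
   "flat_load%o1\t%0, %A1" could expand to "flat_load_ubyte ..." or
   "flat_load_dwordx2 ..." depending on the mode.  */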
void
print_operand (FILE *file, rtx x, int code)
{
int xcode = x ? GET_CODE (x) : 0;
bool invert = false;
switch (code)
{
/* Instructions have the following suffixes.
If there are two suffixes, the first is the destination type,
and the second is the source type.
B32 Bitfield (untyped data) 32-bit
B64 Bitfield (untyped data) 64-bit
F16 floating-point 16-bit
F32 floating-point 32-bit (IEEE 754 single-precision float)
F64 floating-point 64-bit (IEEE 754 double-precision float)
I16 signed 16-bit integer
I32 signed 32-bit integer
I64 signed 64-bit integer
U16 unsigned 16-bit integer
U32 unsigned 32-bit integer
U64 unsigned 64-bit integer */
/* Print operand size as untyped suffix. */
case 'b':
{
const char *s = "";
machine_mode mode = GET_MODE (x);
if (VECTOR_MODE_P (mode))
mode = GET_MODE_INNER (mode);
switch (GET_MODE_SIZE (mode))
{
case 1:
s = "_b8";
break;
case 2:
s = "_b16";
break;
case 4:
s = "_b32";
break;
case 8:
s = "_b64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
fputs (s, file);
}
return;
case 'B':
{
const char *s = "";
machine_mode mode = GET_MODE (x);
if (VECTOR_MODE_P (mode))
mode = GET_MODE_INNER (mode);
switch (GET_MODE_SIZE (mode))
{
case 1:
case 2:
case 4:
s = "_b32";
break;
case 8:
s = "_b64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
fputs (s, file);
}
return;
case 'e':
fputs ("sext(", file);
print_operand (file, x, 0);
fputs (")", file);
return;
case 'i':
case 'I':
case 'u':
case 'U':
{
bool signed_p = code == 'i';
bool min32_p = code == 'I' || code == 'U';
const char *s = "";
machine_mode mode = GET_MODE (x);
if (VECTOR_MODE_P (mode))
mode = GET_MODE_INNER (mode);
if (mode == VOIDmode)
switch (GET_CODE (x))
{
case CONST_INT:
s = signed_p ? "_i32" : "_u32";
break;
case CONST_DOUBLE:
s = "_f64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
else if (FLOAT_MODE_P (mode))
switch (GET_MODE_SIZE (mode))
{
case 2:
s = "_f16";
break;
case 4:
s = "_f32";
break;
case 8:
s = "_f64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
else if (min32_p)
switch (GET_MODE_SIZE (mode))
{
case 1:
case 2:
case 4:
s = signed_p ? "_i32" : "_u32";
break;
case 8:
s = signed_p ? "_i64" : "_u64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
else
switch (GET_MODE_SIZE (mode))
{
case 1:
s = signed_p ? "_i8" : "_u8";
break;
case 2:
s = signed_p ? "_i16" : "_u16";
break;
case 4:
s = signed_p ? "_i32" : "_u32";
break;
case 8:
s = signed_p ? "_i64" : "_u64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
fputs (s, file);
}
return;
/* Print operand size as untyped suffix. */
case 'o':
{
const char *s = 0;
switch (GET_MODE_SIZE (GET_MODE (x)))
{
case 1:
s = "_ubyte";
break;
case 2:
s = "_ushort";
break;
/* The following are full-vector variants. */
case 64:
s = "_ubyte";
break;
case 128:
s = "_ushort";
break;
}
if (s)
{
fputs (s, file);
return;
}
/* Fall-through - the other cases for 'o' are the same as for 's'. */
gcc_fallthrough();
}
case 's':
{
const char *s = "";
switch (GET_MODE_SIZE (GET_MODE (x)))
{
case 1:
s = "_byte";
break;
case 2:
s = "_short";
break;
case 4:
s = "_dword";
break;
case 8:
s = "_dwordx2";
break;
case 12:
s = "_dwordx3";
break;
case 16:
s = "_dwordx4";
break;
case 32:
s = "_dwordx8";
break;
case 64:
s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
break;
/* The following are full-vector variants. */
case 128:
s = "_short";
break;
case 256:
s = "_dword";
break;
case 512:
s = "_dwordx2";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
fputs (s, file);
}
return;
case 'A':
if (xcode != MEM)
{
output_operand_lossage ("invalid %%xn code");
return;
}
print_operand_address (file, x);
return;
case 'O':
{
if (xcode != MEM)
{
output_operand_lossage ("invalid %%xn code");
return;
}
if (AS_GDS_P (MEM_ADDR_SPACE (x)))
fprintf (file, " gds");
rtx x0 = XEXP (x, 0);
if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
{
gcc_assert (TARGET_GCN5_PLUS);
fprintf (file, ", ");
rtx base = x0;
rtx const_offset = NULL_RTX;
if (GET_CODE (base) == PLUS)
{
rtx offset = XEXP (x0, 1);
base = XEXP (x0, 0);
if (GET_CODE (base) == PLUS)
/* (SGPR + VGPR) + CONST */
/* Ignore the VGPR offset for this operand. */
base = XEXP (base, 0);
if (CONST_INT_P (offset))
const_offset = XEXP (x0, 1);
else if (REG_P (offset))
/* SGPR + VGPR */
/* Ignore the VGPR offset for this operand. */
;
else
output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
}
if (REG_P (base))
{
if (VGPR_REGNO_P (REGNO (base)))
/* The VGPR address is specified in the %A operand. */
fprintf (file, "off");
else if (SGPR_REGNO_P (REGNO (base)))
print_reg (file, base);
else
output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
}
else
output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
if (const_offset != NULL_RTX)
fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
INTVAL (const_offset));
return;
}
if (GET_CODE (x0) == REG)
return;
if (GET_CODE (x0) != PLUS)
{
output_operand_lossage ("invalid %%xn code");
return;
}
rtx val = XEXP (x0, 1);
if (GET_CODE (val) == CONST_VECTOR)
val = CONST_VECTOR_ELT (val, 0);
if (GET_CODE (val) != CONST_INT)
{
output_operand_lossage ("invalid %%xn code");
return;
}
fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
}
return;
case 'c':
invert = true;
/* Fall through. */
case 'C':
{
const char *s;
bool num = false;
if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
{
output_operand_lossage ("invalid %%xn code");
return;
}
switch (REGNO (XEXP (x, 0)))
{
case VCC_REG:
case VCCZ_REG:
s = "_vcc";
break;
case SCC_REG:
/* For some reason llvm-mc insists on scc0 instead of sccz. */
num = true;
s = "_scc";
break;
case EXECZ_REG:
s = "_exec";
break;
default:
output_operand_lossage ("invalid %%xn code");
return;
}
fputs (s, file);
if (xcode == (invert ? NE : EQ))
fputc (num ? '0' : 'z', file);
else
fputs (num ? "1" : "nz", file);
return;
}
case 'D':
{
const char *s;
bool cmp_signed = false;
switch (xcode)
{
case EQ:
s = "_eq_";
break;
case NE:
s = "_lg_";
break;
case LT:
s = "_lt_";
cmp_signed = true;
break;
case LE:
s = "_le_";
cmp_signed = true;
break;
case GT:
s = "_gt_";
cmp_signed = true;
break;
case GE:
s = "_ge_";
cmp_signed = true;
break;
case LTU:
s = "_lt_";
break;
case LEU:
s = "_le_";
break;
case GTU:
s = "_gt_";
break;
case GEU:
s = "_ge_";
break;
default:
output_operand_lossage ("invalid %%xn code");
return;
}
fputs (s, file);
fputc (cmp_signed ? 'i' : 'u', file);
machine_mode mode = GET_MODE (XEXP (x, 0));
if (mode == VOIDmode)
mode = GET_MODE (XEXP (x, 1));
/* If both sides are constants, then assume the instruction is in
SImode since s_cmp can only do integer compares. */
if (mode == VOIDmode)
mode = SImode;
switch (GET_MODE_SIZE (mode))
{
case 4:
s = "32";
break;
case 8:
s = "64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
fputs (s, file);
return;
}
case 'E':
{
const char *s;
bool cmp_signed = false;
machine_mode mode = GET_MODE (XEXP (x, 0));
if (mode == VOIDmode)
mode = GET_MODE (XEXP (x, 1));
/* If both sides are constants, assume the instruction is in SFmode
if either operand is floating point, otherwise assume SImode. */
if (mode == VOIDmode)
{
if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
|| GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
mode = SFmode;
else
mode = SImode;
}
/* Use the same format code for vector comparisons. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
mode = GET_MODE_INNER (mode);
bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
switch (xcode)
{
case EQ:
s = "_eq_";
break;
case NE:
s = float_p ? "_neq_" : "_ne_";
break;
case LT:
s = "_lt_";
cmp_signed = true;
break;
case LE:
s = "_le_";
cmp_signed = true;
break;
case GT:
s = "_gt_";
cmp_signed = true;
break;
case GE:
s = "_ge_";
cmp_signed = true;
break;
case LTU:
s = "_lt_";
break;
case LEU:
s = "_le_";
break;
case GTU:
s = "_gt_";
break;
case GEU:
s = "_ge_";
break;
case ORDERED:
s = "_o_";
break;
case UNORDERED:
s = "_u_";
break;
case UNEQ:
s = "_nlg_";
break;
case UNGE:
s = "_nlt_";
break;
case UNGT:
s = "_nle_";
break;
case UNLE:
s = "_ngt_";
break;
case UNLT:
s = "_nge_";
break;
case LTGT:
s = "_lg_";
break;
default:
output_operand_lossage ("invalid %%xn code");
return;
}
fputs (s, file);
fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
switch (GET_MODE_SIZE (mode))
{
case 1:
output_operand_lossage ("operand %%xn code invalid for QImode");
return;
case 2:
s = "16";
break;
case 4:
s = "32";
break;
case 8:
s = "64";
break;
default:
output_operand_lossage ("invalid operand %%xn code");
return;
}
fputs (s, file);
return;
}
case 'L':
print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
return;
case 'H':
print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
return;
case 'R':
/* Print a scalar register number as an integer. Temporary hack. */
gcc_assert (REG_P (x));
fprintf (file, "%u", (int) REGNO (x));
return;
case 'V':
/* Print a vector register number as an integer. Temporary hack. */
gcc_assert (REG_P (x));
fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
return;
case 0:
if (xcode == REG)
print_reg (file, x);
else if (xcode == MEM)
output_address (GET_MODE (x), x);
else if (xcode == CONST_INT)
fprintf (file, "%i", (int) INTVAL (x));
else if (xcode == CONST_VECTOR)
print_operand (file, CONST_VECTOR_ELT (x, 0), code);
else if (xcode == CONST_DOUBLE)
{
const char *str;
switch (gcn_inline_fp_constant_p (x, false))
{
case 240:
str = "0.5";
break;
case 241:
str = "-0.5";
break;
case 242:
str = "1.0";
break;
case 243:
str = "-1.0";
break;
case 244:
str = "2.0";
break;
case 245:
str = "-2.0";
break;
case 246:
str = "4.0";
break;
case 247:
str = "-4.0";
break;
case 248:
str = "1/pi";
break;
default:
rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
? DImode : SImode,
x, GET_MODE (x), 0);
if (ix)
print_operand (file, ix, code);
else
output_operand_lossage ("invalid fp constant");
return;
break;
}
fprintf (file, str);
return;
}
else
output_addr_const (file, x);
return;
case '^':
if (TARGET_GCN5_PLUS)
fputs ("_co", file);
return;
case 'g':
gcc_assert (xcode == MEM);
if (MEM_VOLATILE_P (x))
fputs (" glc", file);
return;
default:
output_operand_lossage ("invalid %%xn code");
}
gcc_unreachable ();
}
/* Implement DBX_REGISTER_NUMBER macro.
Return the DWARF register number that corresponds to the GCC internal
REGNO. */
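/* Illustration, not part of the original source: with the mapping below,
   s0..s63 become DWARF registers 32..95, s64 and above become 1024 plus
   their SGPR number, and v0..v255 become 2560..2815; EXEC_LO is 17,
   VCC_LO is 768, SCC is 128 and the link register is 16.  */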
unsigned int
gcn_dwarf_register_number (unsigned int regno)
{
/* Registers defined in DWARF. */
if (regno == EXEC_LO_REG)
return 17;
/* We need to use a more complex DWARF expression for this
else if (regno == EXEC_HI_REG)
return 17; */
else if (regno == VCC_LO_REG)
return 768;
/* We need to use a more complex DWARF expression for this
else if (regno == VCC_HI_REG)
return 768; */
else if (regno == SCC_REG)
return 128;
else if (regno == DWARF_LINK_REGISTER)
return 16;
else if (SGPR_REGNO_P (regno))
{
if (regno - FIRST_SGPR_REG < 64)
return (regno - FIRST_SGPR_REG + 32);
else
return (regno - FIRST_SGPR_REG + 1024);
}
else if (VGPR_REGNO_P (regno))
return (regno - FIRST_VGPR_REG + 2560);
/* Otherwise, there's nothing sensible to do. */
return regno + 100000;
}
/* Implement TARGET_DWARF_REGISTER_SPAN.
DImode and Vector DImode require additional registers. */
static rtx
gcn_dwarf_register_span (rtx rtl)
{
machine_mode mode = GET_MODE (rtl);
if (VECTOR_MODE_P (mode))
mode = GET_MODE_INNER (mode);
if (GET_MODE_SIZE (mode) != 8)
return NULL_RTX;
unsigned regno = REGNO (rtl);
if (regno == DWARF_LINK_REGISTER)
return NULL_RTX;
rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);
return p;
}
/* }}} */
/* {{{ TARGET hook overrides. */
#undef TARGET_ADDR_SPACE_ADDRESS_MODE
#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
#undef TARGET_ADDR_SPACE_DEBUG
#define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
gcn_addr_space_legitimate_address_p
#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
#undef TARGET_ADDR_SPACE_POINTER_MODE
#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
#undef TARGET_ADDR_SPACE_SUBSET_P
#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
#undef TARGET_ADDR_SPACE_CONVERT
#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
#undef TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START output_file_start
#undef TARGET_ASM_FUNCTION_PROLOGUE
#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
#undef TARGET_ASM_SELECT_SECTION
#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL gcn_builtin_decl
#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
#undef TARGET_DEBUG_UNWIND_INFO
#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
#undef TARGET_DWARF_REGISTER_SPAN
#define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
#undef TARGET_EMUTLS_VAR_INIT
#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
#undef TARGET_FUNCTION_ARG
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
#define TARGET_FUNCTION_ARG gcn_function_arg
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE gcn_function_value
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
gcn_goacc_create_worker_broadcast_record
#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN gcn_fork_join
#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
#undef TARGET_GOACC_SHARED_MEM_LAYOUT
#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS gcn_init_builtins
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
gcn_ira_change_pseudo_allocno_class
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE gcn_option_override
#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
gcn_pretend_outgoing_varargs_named
#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS gcn_rtx_costs
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
#undef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
gcn_small_register_classes_for_mode_p
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS gcn_spill_class
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
gcn_preferred_vector_alignment
#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
gcn_vectorize_support_vector_misalignment
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
gcn_vector_alignment_reachable
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-gcn.h"
/* }}} */