//===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// Hexagon target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "HexagonTargetTransformInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

using namespace llvm;

#define DEBUG_TYPE "hexagontti"

static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
  cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));

static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
  cl::init(true), cl::Hidden,
  cl::desc("Control lookup table emission on Hexagon target"));

// Constant "cost factor" to make floating-point operations more expensive
// in terms of vectorization cost. This isn't the best way, but it should
// do. Ultimately, the cost should use cycles.
static const unsigned FloatFactor = 4;

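// HVX vectorization is used only when the subtarget has HVX ops and the
// -hexagon-autohvx option is enabled.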
bool HexagonTTIImpl::useHVX() const {
  return ST.useHVXOps() && HexagonAutoHVX;
}

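// Check whether a fixed-width vector of integer elements is usable as an
// HVX vector type, either directly or after widening.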
bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
  assert(VecTy->isVectorTy());
  if (cast<VectorType>(VecTy)->isScalable())
    return false;
  // Avoid types like <2 x i32*>.
  if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
    return false;
  EVT VecVT = EVT::getEVT(VecTy);
  if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64)
    return false;
  if (ST.isHVXVectorType(VecVT.getSimpleVT()))
    return true;
  auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT());
  return Action == TargetLoweringBase::TypeWidenVector;
}

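// Return the number of vector elements, or 1 for a scalar type.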
unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
  if (Ty->isVectorTy())
    return Ty->getVectorNumElements();
  assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
         "Expecting scalar type");
  return 1;
}

TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
  // Return fast hardware support as every input < 64 bits will be promoted
  // to 64 bits.
  return TargetTransformInfo::PSK_FastHardware;
}

// The Hexagon target can unroll loops with run-time trip counts.
void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  UP.Runtime = UP.Partial = true;
  // Only try to peel innermost loops with small runtime trip counts.
  if (L && L->empty() && canPeel(L) &&
      SE.getSmallConstantTripCount(L) == 0 &&
      SE.getSmallConstantMaxTripCount(L) > 0 &&
      SE.getSmallConstantMaxTripCount(L) <= 5) {
    UP.PeelCount = 2;
  }
}

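// Hexagon supports post-increment addressing, so prefer post-incremented
// address expressions.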
bool HexagonTTIImpl::shouldFavorPostInc() const {
  return true;
}

/// --- Vector TTI begin ---

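// Hexagon has 32 scalar registers, and 32 HVX vector registers when HVX
// vectorization is enabled.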
unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
  if (Vector)
    return useHVX() ? 32 : 0;
  return 32;
}

unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  return useHVX() ? 2 : 0;
}

unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const {
  return Vector ? getMinVectorRegisterBitWidth() : 32;
}

unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const {
  return useHVX() ? ST.getVectorLength()*8 : 0;
}

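// The minimum vectorization factor is the number of ElemWidth-bit elements
// that fill a single HVX vector register.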
unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
  return (8 * ST.getVectorLength()) / ElemWidth;
}

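// The scalarization, call, and (generic) intrinsic cost queries below have
// no Hexagon-specific refinement and defer to the base implementation.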
unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
      bool Extract) {
  return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
}

unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
      ArrayRef<const Value*> Args, unsigned VF) {
  return BaseT::getOperandsScalarizationOverhead(Args, VF);
}

unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
      ArrayRef<Type*> Tys) {
  return BaseT::getCallInstrCost(F, RetTy, Tys);
}

unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
      ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
}

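// Special-case bswap: cost the number of legalized vectors plus a small
// constant; all other intrinsics defer to the base implementation.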
unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
      ArrayRef<Type*> Tys, FastMathFlags FMF,
      unsigned ScalarizationCostPassed) {
  if (ID == Intrinsic::bswap) {
    std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
    return LT.first + 2;
  }
  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
                                      ScalarizationCostPassed);
}

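// Assume address computations fold into addressing modes and are free.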
unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
      ScalarEvolution *SE, const SCEV *S) {
  return 0;
}

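// Vector loads are costed by the number of full HVX registers they occupy,
// or by the number of aligned scalar loads needed to assemble the vector;
// stores and scalar accesses defer to the base implementation.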
unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                         MaybeAlign Alignment,
                                         unsigned AddressSpace,
                                         const Instruction *I) {
  assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
  if (Opcode == Instruction::Store)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);

  if (Src->isVectorTy()) {
    VectorType *VecTy = cast<VectorType>(Src);
    unsigned VecWidth = VecTy->getBitWidth();
    if (useHVX() && isTypeForHVX(VecTy)) {
      unsigned RegWidth = getRegisterBitWidth(true);
      assert(RegWidth && "Non-zero vector register width expected");
      // Cost of HVX loads.
      if (VecWidth % RegWidth == 0)
        return VecWidth / RegWidth;
      // Cost of constructing an HVX vector from scalar loads.
      const Align RegAlign(RegWidth / 8);
      if (!Alignment || *Alignment > RegAlign)
        Alignment = RegAlign;
      assert(Alignment);
      unsigned AlignWidth = 8 * Alignment->value();
      unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
      return 3 * NumLoads;
    }

    // Non-HVX vectors.
    // Add extra cost for floating point types.
    unsigned Cost =
        VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;

    // At this point unspecified alignment is considered as Align::None().
    const Align BoundAlignment = std::min(Alignment.valueOrOne(), Align(8));
    unsigned AlignWidth = 8 * BoundAlignment.value();
    unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
    if (Alignment == Align(4) || Alignment == Align(8))
      return Cost * NumLoads;
    // Loads of less than 32 bits will need extra inserts to compose a vector.
    assert(BoundAlignment <= Align(8));
    unsigned LogA = Log2(BoundAlignment);
    return (3 - LogA) * Cost * NumLoads;
  }

  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
}

unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode,
      Type *Src, unsigned Alignment, unsigned AddressSpace) {
  return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
}

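// All shuffles are given a nominal unit cost.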
unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
      int Index, Type *SubTp) {
  return 1;
}

unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
      Value *Ptr, bool VariableMask, unsigned Alignment) {
  return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                       Alignment);
}

unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
      Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
      bool UseMaskForGaps) {
  if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace,
                                             UseMaskForCond, UseMaskForGaps);
  return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
                         nullptr);
}

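// Vector floating-point compares are penalized by FloatFactor per element.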
unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
      Type *CondTy, const Instruction *I) {
  if (ValTy->isVectorTy()) {
    std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
    if (Opcode == Instruction::FCmp)
      return LT.first + FloatFactor * getTypeNumElements(ValTy);
  }
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}

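// Vector floating-point arithmetic likewise pays FloatFactor per element;
// other cases defer to the base implementation.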
unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
      TTI::OperandValueProperties Opd1PropInfo,
      TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args) {
  if (Ty->isVectorTy()) {
    std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
    if (LT.second.isFloatingPoint())
      return LT.first + FloatFactor * getTypeNumElements(Ty);
  }
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args);
}

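// Casts involving floating-point types pay FloatFactor per floating-point
// element on either side; all other casts cost 1.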
unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
      Type *SrcTy, const Instruction *I) {
  if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
    unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
    unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;

    std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
    std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
    return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
  }
  return 1;
}

unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
      unsigned Index) {
  Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
                                   : Val;
  if (Opcode == Instruction::InsertElement) {
    // Need two rotations for non-zero index.
    unsigned Cost = (Index != 0) ? 2 : 0;
    if (ElemTy->isIntegerTy(32))
      return Cost;
    // If it's not a 32-bit value, there will need to be an extract.
    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index);
  }

  if (Opcode == Instruction::ExtractElement)
    return 2;

  return 1;
}

/// --- Vector TTI end ---

unsigned HexagonTTIImpl::getPrefetchDistance() const {
  return ST.getL1PrefetchDistance();
}

unsigned HexagonTTIImpl::getCacheLineSize() const {
  return ST.getL1CacheLineSize();
}

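// An extension of a single-use load to i32 folds into the load, so treat
// such casts as free.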
int HexagonTTIImpl::getUserCost(const User *U,
                                ArrayRef<const Value *> Operands) {
  auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
    if (!CI->isIntegerCast())
      return false;
    // Only extensions from an integer type narrower than 32 bits to i32
    // can be folded into the load.
    const DataLayout &DL = getDataLayout();
    unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy());
    unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy());
    if (DBW != 32 || SBW >= DBW)
      return false;

    const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
    // Technically, this code could allow multiple uses of the load, and
    // check if all the uses are the same extension operation, but this
    // should be sufficient for most cases.
    return LI && LI->hasOneUse();
  };

  if (const CastInst *CI = dyn_cast<const CastInst>(U))
    if (isCastFoldedIntoLoad(CI))
      return TargetTransformInfo::TCC_Free;
  return BaseT::getUserCost(U, Operands);
}

bool HexagonTTIImpl::shouldBuildLookupTables() const {
  return EmitLookupTables;
}