; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64
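
; Four adjacent 32-bit stores of nonzero constants should merge into a single
; 16-byte vector store on both targets.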

define void @big_nonzero_16_bytes(i32* nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_16_bytes:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3

  store i32 1, i32* %a, align 4
  store i32 2, i32* %arrayidx1, align 4
  store i32 3, i32* %arrayidx2, align 4
  store i32 4, i32* %arrayidx3, align 4
  ret void
}

; TODO: We assume that two 64-bit scalar stores are better than one vector load
; plus one vector store. But if the 64-bit constants can't be represented as
; sign-extended 32-bit immediates, materializing them in scalar registers takes
; extra instructions.
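; For example, 4294967297 = 0x100000001 sets bit 32, so it is not equal to the
; sign-extension of its low 32 bits and cannot be encoded as an imm32; the X64
; code below needs a 10-byte movabsq to build each constant before storing it.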

define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X64:       # %bb.0:
; X64-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    movabsq $12884901889, %rax # imm = 0x300000001
; X64-NEXT:    movq %rax, 8(%rdi)
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1

  store i64 4294967297, i64* %a
  store i64 12884901889, i64* %arrayidx1
  ret void
}

; Splats may be an opportunity to use a broadcast op.
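; With AVX, an eight-way i32 splat of 42 could be loaded with vbroadcastss from
; a single 4-byte constant-pool entry; the current output below instead loads
; the full 32-byte ymm constant.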

define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
; X32-LABEL: big_nonzero_32_bytes_splat:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_32_bytes_splat:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3
  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 4
  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 5
  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 6
  %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 7

  store i32 42, i32* %a, align 4
  store i32 42, i32* %arrayidx1, align 4
  store i32 42, i32* %arrayidx2, align 4
  store i32 42, i32* %arrayidx3, align 4
  store i32 42, i32* %arrayidx4, align 4
  store i32 42, i32* %arrayidx5, align 4
  store i32 42, i32* %arrayidx6, align 4
  store i32 42, i32* %arrayidx7, align 4
  ret void
}

; Verify that we choose the best-sized store(s) for each chunk.
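; On x86-64, the 63 bytes split as 32 (vmovups ymm) + 3x8 (movq) + 4 (movl) +
; 2 (movw) + 1 (movb). The 32-bit target has no 8-byte scalar integer store,
; so the trailing i64 store of 7 is split into two 4-byte stores instead.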

define void @big_nonzero_63_bytes(i8* nocapture %a) {
; X32-LABEL: big_nonzero_63_bytes:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
; X32-NEXT:    vmovups %xmm0, 32(%eax)
; X32-NEXT:    movl $0, 52(%eax)
; X32-NEXT:    movl $7, 48(%eax)
; X32-NEXT:    movl $8, 56(%eax)
; X32-NEXT:    movw $9, 60(%eax)
; X32-NEXT:    movb $10, 62(%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: big_nonzero_63_bytes:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,3,4]
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    movq $5, 32(%rdi)
; X64-NEXT:    movq $6, 40(%rdi)
; X64-NEXT:    movq $7, 48(%rdi)
; X64-NEXT:    movl $8, 56(%rdi)
; X64-NEXT:    movw $9, 60(%rdi)
; X64-NEXT:    movb $10, 62(%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %a8 = bitcast i8* %a to i64*
  %arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1
  %arrayidx16 = getelementptr inbounds i64, i64* %a8, i64 2
  %arrayidx24 = getelementptr inbounds i64, i64* %a8, i64 3
  %arrayidx32 = getelementptr inbounds i64, i64* %a8, i64 4
  %arrayidx40 = getelementptr inbounds i64, i64* %a8, i64 5
  %arrayidx48 = getelementptr inbounds i64, i64* %a8, i64 6
  %a4 = bitcast i8* %a to i32*
  %arrayidx56 = getelementptr inbounds i32, i32* %a4, i64 14
  %a2 = bitcast i8* %a to i16*
  %arrayidx60 = getelementptr inbounds i16, i16* %a2, i64 30
  %arrayidx62 = getelementptr inbounds i8, i8* %a, i64 62

  store i64 1, i64* %a8
  store i64 2, i64* %arrayidx8
  store i64 3, i64* %arrayidx16
  store i64 4, i64* %arrayidx24
  store i64 5, i64* %arrayidx32
  store i64 6, i64* %arrayidx40
  store i64 7, i64* %arrayidx48
  store i32 8, i32* %arrayidx56
  store i16 9, i16* %arrayidx60
  store i8 10, i8* %arrayidx62
  ret void
}