
Commit ec2d6e2

Unrolled build for #141538
Rollup merge of #141538 - folkertdev:systemv-x86_64-va_arg, r=workingjubilee

implement `va_arg` for x86_64 systemv

tracking issue: #44930

Turns out LLVM's `va_arg` is also unreliable for this target (llvm/llvm-project#141361), so, like clang, we implement our own. I used:

- the spec at https://gitlab.com/x86-psABIs/x86-64-ABI
- the clang implementation at https://github.com/llvm/llvm-project/blob/9a440f84773c56d3803f330774acb2b4f471d5b4/clang/lib/CodeGen/Targets/X86.cpp#L3041

We can take a bunch of shortcuts because the type returned by `va_arg` must implement `VaArgSafe`. I also extended some of the tests, because up to 11 floats can be stored in the `reg_save_area` for this calling convention.

r? `@workingjubilee`

`@rustbot` label +F-c_variadic

try-job: x86_64-apple-1
2 parents: 1c0849d + 94cc726
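For context (not part of this PR): a minimal sketch of the kind of C-variadic Rust function this lowering serves, assuming a nightly toolchain with the `c_variadic` feature; the function name and argument pattern are made up for illustration. Each `ap.arg::<T>()` call is what `emit_va_arg` lowers, and on x86_64 SysV it now reads `T` either from the register save area or from the overflow (stack) area.

```rust
#![feature(c_variadic)]

/// Illustrative only: sums `n` (i32, f64) pairs passed as variadic arguments.
pub unsafe extern "C" fn sum_pairs(n: usize, mut ap: ...) -> f64 {
    let mut total = 0.0;
    for _ in 0..n {
        // i32 is fetched from the general-purpose part of the register save area
        // (or from the overflow area once the GP registers are exhausted).
        let i = ap.arg::<i32>();
        // f64 is fetched from the SSE (XMM) part of the register save area
        // (or from the overflow area once the FP registers are exhausted).
        let f = ap.arg::<f64>();
        total += f64::from(i) + f;
    }
    total
}
```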

File tree: 3 files changed, +331 / -11 lines


compiler/rustc_codegen_llvm/src/va_arg.rs

Lines changed: 319 additions & 9 deletions
@@ -1,7 +1,10 @@
-use rustc_abi::{Align, Endian, HasDataLayout, Size};
+use rustc_abi::{Align, BackendRepr, Endian, HasDataLayout, Primitive, Size, TyAndLayout};
+use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::common::IntPredicate;
 use rustc_codegen_ssa::mir::operand::OperandRef;
-use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods, ConstCodegenMethods};
+use rustc_codegen_ssa::traits::{
+    BaseTypeCodegenMethods, BuilderMethods, ConstCodegenMethods, LayoutTypeCodegenMethods,
+};
 use rustc_middle::ty::Ty;
 use rustc_middle::ty::layout::{HasTyCtxt, LayoutOf};

@@ -303,6 +306,313 @@ fn emit_s390x_va_arg<'ll, 'tcx>(
     bx.load(val_type, val_addr, layout.align.abi)
 }
 
+fn emit_x86_64_sysv64_va_arg<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    list: OperandRef<'tcx, &'ll Value>,
+    target_ty: Ty<'tcx>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    // Implementation of the systemv x86_64 ABI calling convention for va_args, see
+    // https://gitlab.com/x86-psABIs/x86-64-ABI (section 3.5.7). This implementation is heavily
+    // based on the one in clang.
+
+    // We're able to take some shortcuts because the return type of `va_arg` must implement the
+    // `VaArgSafe` trait. Currently, only pointers, f64, i32, u32, i64 and u64 implement this trait.
+
+    // typedef struct __va_list_tag {
+    //     unsigned int gp_offset;
+    //     unsigned int fp_offset;
+    //     void *overflow_arg_area;
+    //     void *reg_save_area;
+    // } va_list[1];
+    let va_list_addr = list.immediate();
+
+    // Peel off any newtype wrappers.
+    //
+    // The "C" ABI does not unwrap newtypes (see `ReprOptions::inhibit_newtype_abi_optimization`).
+    // Here, we do actually want the unwrapped representation, because that is how LLVM/Clang
+    // pass such types to variadic functions.
+    //
+    // An example of a type that must be unwrapped is `Foo` below. Without the unwrapping, it has
+    // `BackendRepr::Memory`, but we need it to be `BackendRepr::Scalar` to generate correct code.
+    //
+    // ```
+    // #[repr(C)]
+    // struct Empty;
+    //
+    // #[repr(C)]
+    // struct Foo([Empty; 8], i32);
+    // ```
+    let layout = {
+        let mut layout = bx.cx.layout_of(target_ty);
+
+        while let Some((_, inner)) = layout.non_1zst_field(bx.cx) {
+            layout = inner;
+        }
+
+        layout
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed
+    // in the registers. If not go to step 7.
+
+    // AMD64-ABI 3.5.7p5: Step 2. Compute num_gp to hold the number of
+    // general purpose registers needed to pass type and num_fp to hold
+    // the number of floating point registers needed.
+
+    let mut num_gp_registers = 0;
+    let mut num_fp_registers = 0;
+
+    let mut registers_for_primitive = |p| match p {
+        Primitive::Int(integer, _is_signed) => {
+            num_gp_registers += integer.size().bytes().div_ceil(8) as u32;
+        }
+        Primitive::Float(float) => {
+            num_fp_registers += float.size().bytes().div_ceil(16) as u32;
+        }
+        Primitive::Pointer(_) => {
+            num_gp_registers += 1;
+        }
+    };
+
+    match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => {
+            registers_for_primitive(scalar.primitive());
+        }
+        BackendRepr::ScalarPair(scalar1, scalar2) => {
+            registers_for_primitive(scalar1.primitive());
+            registers_for_primitive(scalar2.primitive());
+        }
+        BackendRepr::SimdVector { .. } => {
+            // Because no instance of VaArgSafe uses a non-scalar `BackendRepr`.
+            unreachable!(
+                "No x86-64 SysV va_arg implementation for {:?}",
+                layout.layout.backend_repr()
+            )
+        }
+        BackendRepr::Memory { .. } => {
+            let mem_addr = x86_64_sysv64_va_arg_from_memory(bx, va_list_addr, layout);
+            return bx.load(layout.llvm_type(bx), mem_addr, layout.align.abi);
+        }
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 3. Verify whether arguments fit into
+    // registers. In the case: l->gp_offset > 48 - num_gp * 8 or
+    // l->fp_offset > 176 - num_fp * 16 go to step 7.
+
+    let unsigned_int_offset = 4;
+    let ptr_offset = 8;
+    let gp_offset_ptr = va_list_addr;
+    let fp_offset_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(unsigned_int_offset));
+
+    let gp_offset_v = bx.load(bx.type_i32(), gp_offset_ptr, Align::from_bytes(8).unwrap());
+    let fp_offset_v = bx.load(bx.type_i32(), fp_offset_ptr, Align::from_bytes(4).unwrap());
+
+    let mut use_regs = bx.const_bool(false);
+
+    if num_gp_registers > 0 {
+        let max_offset_val = 48u32 - num_gp_registers * 8;
+        let fits_in_gp = bx.icmp(IntPredicate::IntULE, gp_offset_v, bx.const_u32(max_offset_val));
+        use_regs = fits_in_gp;
+    }
+
+    if num_fp_registers > 0 {
+        let max_offset_val = 176u32 - num_fp_registers * 16;
+        let fits_in_fp = bx.icmp(IntPredicate::IntULE, fp_offset_v, bx.const_u32(max_offset_val));
+        use_regs = if num_gp_registers > 0 { bx.and(use_regs, fits_in_fp) } else { fits_in_fp };
+    }
+
+    let in_reg = bx.append_sibling_block("va_arg.in_reg");
+    let in_mem = bx.append_sibling_block("va_arg.in_mem");
+    let end = bx.append_sibling_block("va_arg.end");
+
+    bx.cond_br(use_regs, in_reg, in_mem);
+
+    // Emit code to load the value if it was passed in a register.
+    bx.switch_to_block(in_reg);
+
+    // AMD64-ABI 3.5.7p5: Step 4. Fetch type from l->reg_save_area with
+    // an offset of l->gp_offset and/or l->fp_offset. This may require
+    // copying to a temporary location in case the parameter is passed
+    // in different register classes or requires an alignment greater
+    // than 8 for general purpose registers and 16 for XMM registers.
+    //
+    // FIXME(llvm): This really results in shameful code when we end up needing to
+    // collect arguments from different places; often what should result in a
+    // simple assembling of a structure from scattered addresses has many more
+    // loads than necessary. Can we clean this up?
+    let reg_save_area_ptr =
+        bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(2 * unsigned_int_offset + ptr_offset));
+    let reg_save_area_v = bx.load(bx.type_ptr(), reg_save_area_ptr, dl.pointer_align.abi);
+
+    let reg_addr = match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => match scalar.primitive() {
+            Primitive::Int(_, _) | Primitive::Pointer(_) => {
+                let reg_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+
+                // Copy into a temporary if the type is more aligned than the register save area.
+                let gp_align = Align::from_bytes(8).unwrap();
+                copy_to_temporary_if_more_aligned(bx, reg_addr, layout, gp_align)
+            }
+            Primitive::Float(_) => bx.inbounds_ptradd(reg_save_area_v, fp_offset_v),
+        },
+        BackendRepr::ScalarPair(scalar1, scalar2) => {
+            let ty_lo = bx.cx().scalar_pair_element_backend_type(layout, 0, false);
+            let ty_hi = bx.cx().scalar_pair_element_backend_type(layout, 1, false);
+
+            let align_lo = layout.field(bx.cx, 0).layout.align().abi;
+            let align_hi = layout.field(bx.cx, 1).layout.align().abi;
+
+            match (scalar1.primitive(), scalar2.primitive()) {
+                (Primitive::Float(_), Primitive::Float(_)) => {
+                    // SSE registers are spaced 16 bytes apart in the register save
+                    // area, we need to collect the two eightbytes together.
+                    // The ABI isn't explicit about this, but it seems reasonable
+                    // to assume that the slots are 16-byte aligned, since the stack is
+                    // naturally 16-byte aligned and the prologue is expected to store
+                    // all the SSE registers to the RSA.
+                    let reg_lo_addr = bx.inbounds_ptradd(reg_save_area_v, fp_offset_v);
+                    let reg_hi_addr = bx.inbounds_ptradd(reg_lo_addr, bx.const_i32(16));
+
+                    let align = layout.layout.align().abi;
+                    let tmp = bx.alloca(layout.layout.size(), align);
+
+                    let reg_lo = bx.load(ty_lo, reg_lo_addr, align_lo);
+                    let reg_hi = bx.load(ty_hi, reg_hi_addr, align_hi);
+
+                    let offset = scalar1.size(bx.cx).align_to(align_hi).bytes();
+                    let field0 = tmp;
+                    let field1 = bx.inbounds_ptradd(tmp, bx.const_u32(offset as u32));
+
+                    bx.store(reg_lo, field0, align);
+                    bx.store(reg_hi, field1, align);
+
+                    tmp
+                }
+                (Primitive::Float(_), _) | (_, Primitive::Float(_)) => {
+                    let gp_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+                    let fp_addr = bx.inbounds_ptradd(reg_save_area_v, fp_offset_v);
+
+                    let (reg_lo_addr, reg_hi_addr) = match scalar1.primitive() {
+                        Primitive::Float(_) => (fp_addr, gp_addr),
+                        Primitive::Int(_, _) | Primitive::Pointer(_) => (gp_addr, fp_addr),
+                    };
+
+                    let tmp = bx.alloca(layout.layout.size(), layout.layout.align().abi);
+
+                    let reg_lo = bx.load(ty_lo, reg_lo_addr, align_lo);
+                    let reg_hi = bx.load(ty_hi, reg_hi_addr, align_hi);
+
+                    let offset = scalar1.size(bx.cx).align_to(align_hi).bytes();
+                    let field0 = tmp;
+                    let field1 = bx.inbounds_ptradd(tmp, bx.const_u32(offset as u32));
+
+                    bx.store(reg_lo, field0, align_lo);
+                    bx.store(reg_hi, field1, align_hi);
+
+                    tmp
+                }
+                (_, _) => {
+                    // Two integer/pointer values are just contiguous in memory.
+                    let reg_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+
+                    // Copy into a temporary if the type is more aligned than the register save area.
+                    let gp_align = Align::from_bytes(8).unwrap();
+                    copy_to_temporary_if_more_aligned(bx, reg_addr, layout, gp_align)
+                }
+            }
+        }
+        // The Previous match on `BackendRepr` means control flow already escaped.
+        BackendRepr::SimdVector { .. } | BackendRepr::Memory { .. } => unreachable!(),
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 5. Set:
+    // l->gp_offset = l->gp_offset + num_gp * 8
+    if num_gp_registers > 0 {
+        let offset = bx.const_u32(num_gp_registers * 8);
+        let sum = bx.add(gp_offset_v, offset);
+        // An alignment of 8 because `__va_list_tag` is 8-aligned and this is its first field.
+        bx.store(sum, gp_offset_ptr, Align::from_bytes(8).unwrap());
+    }
+
+    // l->fp_offset = l->fp_offset + num_fp * 16.
+    if num_fp_registers > 0 {
+        let offset = bx.const_u32(num_fp_registers * 16);
+        let sum = bx.add(fp_offset_v, offset);
+        bx.store(sum, fp_offset_ptr, Align::from_bytes(4).unwrap());
+    }
+
+    bx.br(end);
+
+    bx.switch_to_block(in_mem);
+    let mem_addr = x86_64_sysv64_va_arg_from_memory(bx, va_list_addr, layout);
+    bx.br(end);
+
+    bx.switch_to_block(end);
+
+    let val_type = layout.llvm_type(bx);
+    let val_addr = bx.phi(bx.type_ptr(), &[reg_addr, mem_addr], &[in_reg, in_mem]);
+
+    bx.load(val_type, val_addr, layout.align.abi)
+}
+
+/// Copy into a temporary if the type is more aligned than the register save area.
+fn copy_to_temporary_if_more_aligned<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    reg_addr: &'ll Value,
+    layout: TyAndLayout<'tcx, Ty<'tcx>>,
+    src_align: Align,
+) -> &'ll Value {
+    if layout.layout.align.abi > src_align {
+        let tmp = bx.alloca(layout.layout.size(), layout.layout.align().abi);
+        bx.memcpy(
+            tmp,
+            layout.layout.align.abi,
+            reg_addr,
+            src_align,
+            bx.const_u32(layout.layout.size().bytes() as u32),
+            MemFlags::empty(),
+        );
+        tmp
+    } else {
+        reg_addr
+    }
+}
+
+fn x86_64_sysv64_va_arg_from_memory<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    va_list_addr: &'ll Value,
+    layout: TyAndLayout<'tcx, Ty<'tcx>>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    let overflow_arg_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.const_usize(8));
+
+    let overflow_arg_area_v = bx.load(bx.type_ptr(), overflow_arg_area_ptr, dl.pointer_align.abi);
+    // AMD64-ABI 3.5.7p5: Step 7. Align l->overflow_arg_area upwards to a 16
+    // byte boundary if alignment needed by type exceeds 8 byte boundary.
+    // It isn't stated explicitly in the standard, but in practice we use
+    // alignment greater than 16 where necessary.
+    if layout.layout.align.abi.bytes() > 8 {
+        unreachable!("all instances of VaArgSafe have an alignment <= 8");
+    }
+
+    // AMD64-ABI 3.5.7p5: Step 8. Fetch type from l->overflow_arg_area.
+    let mem_addr = overflow_arg_area_v;
+
+    // AMD64-ABI 3.5.7p5: Step 9. Set l->overflow_arg_area to:
+    // l->overflow_arg_area + sizeof(type).
+    // AMD64-ABI 3.5.7p5: Step 10. Align l->overflow_arg_area upwards to
+    // an 8 byte boundary.
+    let size_in_bytes = layout.layout.size().bytes();
+    let offset = bx.const_i32(size_in_bytes.next_multiple_of(8) as i32);
+    let overflow_arg_area = bx.inbounds_ptradd(overflow_arg_area_v, offset);
+    bx.store(overflow_arg_area, overflow_arg_area_ptr, dl.pointer_align.abi);
+
+    mem_addr
+}
+
 fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     list: OperandRef<'tcx, &'ll Value>,
@@ -334,8 +644,7 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     // (*va).va_ndx
     let va_reg_offset = 4;
     let va_ndx_offset = va_reg_offset + 4;
-    let offset_ptr =
-        bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(va_ndx_offset)]);
+    let offset_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(va_ndx_offset));
 
     let offset = bx.load(bx.type_i32(), offset_ptr, bx.tcx().data_layout.i32_align.abi);
     let offset = round_up_to_alignment(bx, offset, layout.align.abi);
@@ -356,11 +665,10 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx.store(offset_next, offset_ptr, bx.tcx().data_layout.pointer_align.abi);
 
     // (*va).va_reg
-    let regsave_area_ptr =
-        bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(va_reg_offset)]);
+    let regsave_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(va_reg_offset));
     let regsave_area =
         bx.load(bx.type_ptr(), regsave_area_ptr, bx.tcx().data_layout.pointer_align.abi);
-    let regsave_value_ptr = bx.inbounds_gep(bx.type_i8(), regsave_area, &[offset]);
+    let regsave_value_ptr = bx.inbounds_ptradd(regsave_area, offset);
     bx.br(end);
 
     bx.switch_to_block(from_stack);
@@ -381,9 +689,9 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
    bx.store(offset_next_corrected, offset_ptr, bx.tcx().data_layout.pointer_align.abi);
 
     // let stack_value_ptr = unsafe { (*va).va_stk.byte_add(offset_corrected) };
-    let stack_area_ptr = bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(0)]);
+    let stack_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(0));
     let stack_area = bx.load(bx.type_ptr(), stack_area_ptr, bx.tcx().data_layout.pointer_align.abi);
-    let stack_value_ptr = bx.inbounds_gep(bx.type_i8(), stack_area, &[offset_corrected]);
+    let stack_value_ptr = bx.inbounds_ptradd(stack_area, offset_corrected);
     bx.br(end);
 
     bx.switch_to_block(end);
@@ -449,6 +757,8 @@ pub(super) fn emit_va_arg<'ll, 'tcx>(
                 AllowHigherAlign::No,
             )
         }
+        // This includes `target.is_like_darwin`, which on x86_64 targets is like sysv64.
+        "x86_64" => emit_x86_64_sysv64_va_arg(bx, addr, target_ty),
         "xtensa" => emit_xtensa_va_arg(bx, addr, target_ty),
         // For all other architecture/OS combinations fall back to using
         // the LLVM va_arg instruction.
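A note on the constants in `emit_x86_64_sysv64_va_arg` above. The hard-coded offsets (`unsigned_int_offset = 4`, `ptr_offset = 8`, the `const_usize(8)` and `const_usize(2 * unsigned_int_offset + ptr_offset)` pointer adjustments) follow from the `__va_list_tag` layout quoted in the function's comment. A restatement of those field offsets as a Rust struct, for illustration only (the struct name is mine, not part of the diff):

```rust
/// Illustrative only: the C `__va_list_tag` from the psABI, with the byte offsets
/// that `emit_x86_64_sysv64_va_arg` hard-codes (u32 is 4 bytes, pointers are
/// 8 bytes on x86_64, so the struct is 8-aligned and 24 bytes in size).
#[repr(C)]
struct VaListTag {
    gp_offset: u32,             // byte offset 0
    fp_offset: u32,             // byte offset 4  (= unsigned_int_offset)
    overflow_arg_area: *mut u8, // byte offset 8  (= 2 * unsigned_int_offset)
    reg_save_area: *mut u8,     // byte offset 16 (= 2 * unsigned_int_offset + ptr_offset)
}
```

Likewise, the Step 3 thresholds come from the register save area layout: 48 bytes of general-purpose registers (6 × 8) followed by 128 bytes of SSE registers (8 × 16), 176 bytes total. A sketch of the same fit check over plain integers, illustrative only and not part of the diff:

```rust
/// Illustrative only: the Step 3 "does it still fit in registers?" test that the
/// generated IR performs. `gp_offset`/`fp_offset` are the current cursors into the
/// register save area; `num_gp`/`num_fp` are the register counts from Step 2.
fn fits_in_registers(gp_offset: u32, fp_offset: u32, num_gp: u32, num_fp: u32) -> bool {
    // Mirrors the generated code: `use_regs` starts out false, so a type that
    // needs no registers at all falls through to the overflow area.
    if num_gp == 0 && num_fp == 0 {
        return false;
    }
    let gp_ok = num_gp == 0 || gp_offset <= 48 - num_gp * 8;
    let fp_ok = num_fp == 0 || fp_offset <= 176 - num_fp * 16;
    gp_ok && fp_ok
}
```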

tests/run-make/c-link-to-rust-va-list-fn/checkrust.rs

Lines changed: 9 additions & 0 deletions
@@ -112,6 +112,9 @@ pub unsafe extern "C" fn check_varargs_4(_: c_double, mut ap: ...) -> usize {
     continue_if!(ap.arg::<c_double>() == 8.0);
     continue_if!(ap.arg::<c_double>() == 9.0);
     continue_if!(ap.arg::<c_double>() == 10.0);
+    continue_if!(ap.arg::<c_double>() == 11.0);
+    continue_if!(ap.arg::<c_double>() == 12.0);
+    continue_if!(ap.arg::<c_double>() == 13.0);
     0
 }

@@ -137,5 +140,11 @@ pub unsafe extern "C" fn check_varargs_5(_: c_int, mut ap: ...) -> usize {
     continue_if!(ap.arg::<c_double>() == 9.0);
     continue_if!(ap.arg::<c_int>() == 10);
     continue_if!(ap.arg::<c_double>() == 10.0);
+    continue_if!(ap.arg::<c_int>() == 11);
+    continue_if!(ap.arg::<c_double>() == 11.0);
+    continue_if!(ap.arg::<c_int>() == 12);
+    continue_if!(ap.arg::<c_double>() == 12.0);
+    continue_if!(ap.arg::<c_int>() == 13);
+    continue_if!(ap.arg::<c_double>() == 13.0);
     0
 }
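These extra checks matter because they overflow the floating-point registers: only 8 SSE registers carry float arguments on x86_64 SysV, and the fixed `c_double` parameter of `check_varargs_4` already occupies one, so the later `arg::<c_double>()` calls must be served from the overflow (stack) area rather than the register save area. Illustrative arithmetic only (the constants below are mine, not from the test):

```rust
// Illustrative only: why 13 trailing doubles exercise both va_arg paths.
const SSE_ARG_REGS: u32 = 8;      // xmm0..xmm7 carry float arguments
const FIXED_DOUBLES: u32 = 1;     // the named `c_double` parameter of check_varargs_4
const VARIADIC_DOUBLES: u32 = 13; // doubles read via `ap.arg::<c_double>()`

// Doubles that no longer fit in registers and land in the overflow area.
const SPILLED: u32 = VARIADIC_DOUBLES - (SSE_ARG_REGS - FIXED_DOUBLES); // = 6
```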
