@@ -424,6 +424,34 @@ LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )
424
424
#endif
425
425
return result ;
426
426
#elif SCALE16BY8_AVRASM == 1
427
+ #if FASTLED_SCALE8_FIXED == 1
428
+ uint16_t result = 0 ;
429
+ asm volatile (
430
+ // result.A = HighByte( (i.A x scale) + i.A )
431
+ " mul %A[i], %[scale] \n\t"
432
+ " add r0, %A[i] \n\t"
433
+ // " adc r1, [zero] \n\t"
434
+ // " mov %A[result], r1 \n\t"
435
+ " adc %A[result], r1 \n\t"
436
+
437
+ // result.A-B += i.B x scale
438
+ " mul %B[i], %[scale] \n\t"
439
+ " add %A[result], r0 \n\t"
440
+ " adc %B[result], r1 \n\t"
441
+
442
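+ // cleanup r1: the mul above left the product's high byte in it; it must be zeroed here, before the final adc below, which uses __zero_reg__ as its zero operand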
443
+ " clr __zero_reg__ \n\t"
444
+
445
+ // result.A-B += i.B
446
+ " add %A[result], %B[i] \n\t"
447
+ " adc %B[result], __zero_reg__ \n\t"
448
+
449
+ : [result ] "+r" (result )
450
+ : [i ] "r" (i ), [scale ] "r" (scale )
451
+ : "r0" , "r1"
452
+ );
453
+ return result ;
454
+ #else
427
455
uint16_t result = 0 ;
428
456
asm volatile (
429
457
// result.A = HighByte(i.A x j )
@@ -444,6 +472,7 @@ LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )
444
472
: "r0" , "r1"
445
473
);
446
474
return result ;
475
+ #endif
447
476
#else
448
477
#error "No implementation for scale16by8 available."
449
478
#endif
@@ -464,6 +493,14 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
464
493
#endif
465
494
return result ;
466
495
#elif SCALE16_AVRASM == 1
496
+ #if FASTLED_SCALE8_FIXED == 1
497
+ // implemented sort of like
498
+ // result = ((i * scale) + i ) / 65536
499
+ //
500
+ // why not like this, you may ask?
501
+ // result = (i * (scale+1)) / 65536
502
+ // the answer is that if scale is 65535, then scale+1
503
+ // will be zero, which is not what we want.
467
504
uint32_t result ;
468
505
asm volatile (
469
506
// result.A-B = i.A x scale.A
@@ -474,7 +511,80 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
474
511
//" mov %B[result], r1 \n\t"
475
512
// which can be written as...
476
513
" movw %A[result], r0 \n\t"
477
- // We actually need to do anything with r0,
514
+ // Because we're going to add i.A-B to
515
+ // result.A-D, we DO need to keep both
516
+ // the r0 and r1 portions of the product
517
+ // UNlike in the 'unfixed scale8' version.
518
+ // So the movw here is needed.
519
+ : [result ] "=r" (result )
520
+ : [i ] "r" (i ),
521
+ [scale ] "r" (scale )
522
+ : "r0" , "r1"
523
+ );
524
+
525
+ asm volatile (
526
+ // result.C-D = i.B x scale.B
527
+ " mul %B[i], %B[scale] \n\t"
528
+ //" mov %C[result], r0 \n\t"
529
+ //" mov %D[result], r1 \n\t"
530
+ " movw %C[result], r0 \n\t"
531
+ : [result ] "+r" (result )
532
+ : [i ] "r" (i ),
533
+ [scale ] "r" (scale )
534
+ : "r0" , "r1"
535
+ );
536
+
537
+ const uint8_t zero = 0 ;
538
+ asm volatile (
539
+ // result.B-D += i.B x scale.A
540
+ " mul %B[i], %A[scale] \n\t"
541
+
542
+ " add %B[result], r0 \n\t"
543
+ " adc %C[result], r1 \n\t"
544
+ " adc %D[result], %[zero] \n\t"
545
+
546
+ // result.B-D += i.A x scale.B
547
+ " mul %A[i], %B[scale] \n\t"
548
+
549
+ " add %B[result], r0 \n\t"
550
+ " adc %C[result], r1 \n\t"
551
+ " adc %D[result], %[zero] \n\t"
552
+
553
+ // cleanup r1: the mul instructions above overwrote r0/r1, so reset r1 to zero before leaving this asm block
554
+ " clr r1 \n\t"
555
+
556
+ : [result ] "+r" (result )
557
+ : [i ] "r" (i ),
558
+ [scale ] "r" (scale ),
559
+ [zero ] "r" (zero )
560
+ : "r0" , "r1"
561
+ );
562
+
563
+ asm volatile (
564
+ // result.A-D += i.A-B
565
+ " add %A[result], %A[i] \n\t"
566
+ " adc %B[result], %B[i] \n\t"
567
+ " adc %C[result], %[zero] \n\t"
568
+ " adc %D[result], %[zero] \n\t"
569
+ : [result ] "+r" (result )
570
+ : [i ] "r" (i ),
571
+ [zero ] "r" (zero )
572
+ );
573
+
574
+ result = result >> 16 ;
575
+ return result ;
576
+ #else
577
+ uint32_t result ;
578
+ asm volatile (
579
+ // result.A-B = i.A x scale.A
580
+ " mul %A[i], %A[scale] \n\t"
581
+ // save results...
582
+ // basic idea:
583
+ //" mov %A[result], r0 \n\t"
584
+ //" mov %B[result], r1 \n\t"
585
+ // which can be written as...
586
+ " movw %A[result], r0 \n\t"
587
+ // We actually don't need to do anything with r0,
478
588
// as result.A is never used again here, so we
479
589
// could just move the high byte, but movw is
480
590
// one clock cycle, just like mov, so might as
@@ -527,6 +637,7 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
527
637
528
638
result = result >> 16 ;
529
639
return result ;
640
+ #endif
530
641
#else
531
642
#error "No implementation for scale16 available."
532
643
#endif
0 commit comments