896 {
899
900 if(strideA==-1) strideA = depth;
901 if(strideB==-1) strideB = depth;
902 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
903 Index packet_cols4 = nr>=4 ? (
cols/4) * 4 : 0;
907 enum { pk = 8 };
908 const Index peeled_kc = depth & ~(pk-1);
910
911
912
913
914
916 {
917
918
919
920
921
923
924
925
926 const Index actual_panel_rows = (3*
LhsProgress) * std::max<Index>(1,( (l1 -
sizeof(
ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 3*
LhsProgress) ));
927 for(
Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
928 {
929 const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
930 for(
Index j2=0; j2<packet_cols4; j2+=nr)
931 {
933 {
934
935
936
937
938 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*
LhsProgress)];
940
941
943 C4, C5, C6, C7,
944 C8, C9, C10, C11;
945 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
946 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
947 traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
948
953
954 r0.prefetch(0);
955 r1.prefetch(0);
956 r2.prefetch(0);
957 r3.prefetch(0);
958
959
960 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
963
964 for(
Index k=0; k<peeled_kc; k+=pk)
965 {
969
970#define EIGEN_GEBP_ONESTEP(K) \
971 do { \
972 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974 internal::prefetch(blA+(3*K+16)*LhsProgress); \
975 if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } \
976 traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977 traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978 traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979 traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980 traits.madd(A0, B_0, C0, T0); \
981 traits.madd(A1, B_0, C4, T0); \
982 traits.madd(A2, B_0, C8, B_0); \
983 traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984 traits.madd(A0, B_0, C1, T0); \
985 traits.madd(A1, B_0, C5, T0); \
986 traits.madd(A2, B_0, C9, B_0); \
987 traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988 traits.madd(A0, B_0, C2, T0); \
989 traits.madd(A1, B_0, C6, T0); \
990 traits.madd(A2, B_0, C10, B_0); \
991 traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992 traits.madd(A0, B_0, C3 , T0); \
993 traits.madd(A1, B_0, C7, T0); \
994 traits.madd(A2, B_0, C11, B_0); \
995 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
996 } while(false)
997
1007
1010
1012 }
1013
1014 for(
Index k=peeled_kc; k<depth; k++)
1015 {
1021 }
1022
1023#undef EIGEN_GEBP_ONESTEP
1024
1026 ResPacket alphav = pset1<ResPacket>(alpha);
1027
1031 traits.acc(C0, alphav, R0);
1032 traits.acc(C4, alphav, R1);
1033 traits.acc(C8, alphav, R2);
1037
1041 traits.acc(C1, alphav, R0);
1042 traits.acc(C5, alphav, R1);
1043 traits.acc(C9, alphav, R2);
1047
1051 traits.acc(C2, alphav, R0);
1052 traits.acc(C6, alphav, R1);
1053 traits.acc(C10, alphav, R2);
1057
1061 traits.acc(C3, alphav, R0);
1062 traits.acc(C7, alphav, R1);
1063 traits.acc(C11, alphav, R2);
1067 }
1068 }
1069
1070
1071 for(
Index j2=packet_cols4; j2<
cols; j2++)
1072 {
1074 {
1075
1078
1079
1081 traits.initAcc(C0);
1082 traits.initAcc(C4);
1083 traits.initAcc(C8);
1084
1086 r0.prefetch(0);
1087
1088
1089 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1091
1092 for(
Index k=0; k<peeled_kc; k+=pk)
1093 {
1096#define EIGEN_GEBGP_ONESTEP(K) \
1097 do { \
1098 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100 traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101 traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102 traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104 traits.madd(A0, B_0, C0, B_0); \
1105 traits.madd(A1, B_0, C4, B_0); \
1106 traits.madd(A2, B_0, C8, B_0); \
1107 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1108 } while(false)
1109
1118
1121
1123 }
1124
1125
1126 for(
Index k=peeled_kc; k<depth; k++)
1127 {
1132 }
1133#undef EIGEN_GEBGP_ONESTEP
1135 ResPacket alphav = pset1<ResPacket>(alpha);
1136
1140 traits.acc(C0, alphav, R0);
1141 traits.acc(C4, alphav, R1);
1142 traits.acc(C8, alphav, R2);
1146 }
1147 }
1148 }
1149 }
1150
1151
1153 {
1155
1156
1157
1159
1160 for(
Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1161 {
1162 Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1163 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1164 {
1166 {
1167
1168
1169
1170
1173
1174
1176 C4, C5, C6, C7;
1177 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1178 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1179
1184
1185 r0.prefetch(prefetch_res_offset);
1186 r1.prefetch(prefetch_res_offset);
1187 r2.prefetch(prefetch_res_offset);
1188 r3.prefetch(prefetch_res_offset);
1189
1190
1191 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1194
1195 for(
Index k=0; k<peeled_kc; k+=pk)
1196 {
1199
1200
1201
1202 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1203 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1204 #else
1205 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1206 #endif
1207 #define EIGEN_GEBGP_ONESTEP(K) \
1208 do { \
1209 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213 traits.madd(A0, B_0, C0, T0); \
1214 traits.madd(A1, B_0, C4, B_0); \
1215 traits.madd(A0, B1, C1, T0); \
1216 traits.madd(A1, B1, C5, B1); \
1217 traits.madd(A0, B2, C2, T0); \
1218 traits.madd(A1, B2, C6, B2); \
1219 traits.madd(A0, B3, C3, T0); \
1220 traits.madd(A1, B3, C7, B3); \
1221 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1223 } while(false)
1224
1235
1238
1240 }
1241
1242 for(
Index k=peeled_kc; k<depth; k++)
1243 {
1248 }
1249#undef EIGEN_GEBGP_ONESTEP
1250
1252 ResPacket alphav = pset1<ResPacket>(alpha);
1253
1258 traits.acc(C0, alphav, R0);
1259 traits.acc(C4, alphav, R1);
1260 traits.acc(C1, alphav, R2);
1261 traits.acc(C5, alphav, R3);
1266
1271 traits.acc(C2, alphav, R0);
1272 traits.acc(C6, alphav, R1);
1273 traits.acc(C3, alphav, R2);
1274 traits.acc(C7, alphav, R3);
1279 }
1280 }
1281
1282
1283 for(
Index j2=packet_cols4; j2<
cols; j2++)
1284 {
1286 {
1287
1290
1291
1293 traits.initAcc(C0);
1294 traits.initAcc(C4);
1295
1297 r0.prefetch(prefetch_res_offset);
1298
1299
1300 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1302
1303 for(
Index k=0; k<peeled_kc; k+=pk)
1304 {
1307
1308#define EIGEN_GEBGP_ONESTEP(K) \
1309 do { \
1310 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1311 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1312 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315 traits.madd(A0, B_0, C0, B1); \
1316 traits.madd(A1, B_0, C4, B_0); \
1317 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1318 } while(false)
1319
1328
1331
1333 }
1334
1335
1336 for(
Index k=peeled_kc; k<depth; k++)
1337 {
1342 }
1343#undef EIGEN_GEBGP_ONESTEP
1345 ResPacket alphav = pset1<ResPacket>(alpha);
1346
1349 traits.acc(C0, alphav, R0);
1350 traits.acc(C4, alphav, R1);
1353 }
1354 }
1355 }
1356 }
1357
1359 {
1360
1362 {
1363
1364 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1365 {
1366
1367
1368
1371
1372
1374 traits.initAcc(C0);
1375 traits.initAcc(C1);
1376 traits.initAcc(C2);
1377 traits.initAcc(C3);
1378
1383
1384 r0.prefetch(prefetch_res_offset);
1385 r1.prefetch(prefetch_res_offset);
1386 r2.prefetch(prefetch_res_offset);
1387 r3.prefetch(prefetch_res_offset);
1388
1389
1390 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1393
1394 for(
Index k=0; k<peeled_kc; k+=pk)
1395 {
1398
1399#define EIGEN_GEBGP_ONESTEP(K) \
1400 do { \
1401 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403 traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405 traits.madd(A0, B_0, C0, B_0); \
1406 traits.madd(A0, B1, C1, B1); \
1407 traits.madd(A0, B2, C2, B2); \
1408 traits.madd(A0, B3, C3, B3); \
1409 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1410 } while(false)
1411
1422
1425
1427 }
1428
1429 for(
Index k=peeled_kc; k<depth; k++)
1430 {
1435 }
1436#undef EIGEN_GEBGP_ONESTEP
1437
1439 ResPacket alphav = pset1<ResPacket>(alpha);
1440
1443 traits.acc(C0, alphav, R0);
1444 traits.acc(C1, alphav, R1);
1447
1450 traits.acc(C2, alphav, R0);
1451 traits.acc(C3, alphav, R1);
1454 }
1455
1456
1457 for(
Index j2=packet_cols4; j2<
cols; j2++)
1458 {
1459
1462
1463
1465 traits.initAcc(C0);
1466
1468
1469
1470 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1472
1473 for(
Index k=0; k<peeled_kc; k+=pk)
1474 {
1477
1478#define EIGEN_GEBGP_ONESTEP(K) \
1479 do { \
1480 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482 traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484 traits.madd(A0, B_0, C0, B_0); \
1485 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1486 } while(false);
1487
1496
1499
1501 }
1502
1503
1504 for(
Index k=peeled_kc; k<depth; k++)
1505 {
1510 }
1511#undef EIGEN_GEBGP_ONESTEP
1513 ResPacket alphav = pset1<ResPacket>(alpha);
1515 traits.acc(C0, alphav, R0);
1517 }
1518 }
1519 }
1520
1521 if(peeled_mc1<rows)
1522 {
1523
1524 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1525 {
1526
1528 {
1529 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1531 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1532
1533
1534
1535
1536 const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1540 {
1542 straits.initAcc(C0);
1543 straits.initAcc(C1);
1544 straits.initAcc(C2);
1545 straits.initAcc(C3);
1546
1548 const Index endk = (depth/spk)*spk;
1549 const Index endk4 = (depth/(spk*4))*(spk*4);
1550
1552 for(; k<endk4; k+=4*spk)
1553 {
1556
1559
1560 straits.loadRhsQuad(blA+0*spk, B_0);
1561 straits.loadRhsQuad(blA+1*spk, B_1);
1562 straits.madd(A0,B_0,C0,B_0);
1563 straits.madd(A1,B_1,C1,B_1);
1564
1567 straits.loadRhsQuad(blA+2*spk, B_0);
1568 straits.loadRhsQuad(blA+3*spk, B_1);
1569 straits.madd(A0,B_0,C2,B_0);
1570 straits.madd(A1,B_1,C3,B_1);
1571
1573 blA += 4*spk;
1574 }
1576 for(; k<endk; k+=spk)
1577 {
1580
1581 straits.loadLhsUnaligned(blB, A0);
1582 straits.loadRhsQuad(blA, B_0);
1583 straits.madd(A0,B_0,C0,B_0);
1584
1586 blA += spk;
1587 }
1589 {
1590
1595
1596 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1597 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1598
1599 if(depth-endk>0)
1600 {
1601
1602 SLhsPacketHalf a0;
1603 SRhsPacketHalf b0;
1604 straits.loadLhsUnaligned(blB, a0);
1605 straits.loadRhs(blA, b0);
1607 straits.madd(a0,b0,c0,b0);
1608 straits.acc(c0, alphav, R);
1609 }
1610 else
1611 {
1613 }
1614 res.scatterPacket(i, j2, R);
1615 }
1616 else
1617 {
1618 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1619 SResPacket alphav = pset1<SResPacket>(alpha);
1620 straits.acc(C0, alphav, R);
1621 res.scatterPacket(i, j2, R);
1622 }
1623 }
1624 else
1625 {
1626
1628
1629 for(
Index k=0; k<depth; k++)
1630 {
1631 LhsScalar A0;
1632 RhsScalar B_0, B_1;
1633
1634 A0 = blA[k];
1635
1636 B_0 = blB[0];
1637 B_1 = blB[1];
1638 CJMADD(cj,A0,B_0,C0, B_0);
1639 CJMADD(cj,A0,B_1,C1, B_1);
1640
1641 B_0 = blB[2];
1642 B_1 = blB[3];
1643 CJMADD(cj,A0,B_0,C2, B_0);
1644 CJMADD(cj,A0,B_1,C3, B_1);
1645
1646 blB += 4;
1647 }
1648 res(i, j2 + 0) += alpha * C0;
1649 res(i, j2 + 1) += alpha * C1;
1650 res(i, j2 + 2) += alpha * C2;
1651 res(i, j2 + 3) += alpha * C3;
1652 }
1653 }
1654 }
1655
1656 for(
Index j2=packet_cols4; j2<
cols; j2++)
1657 {
1658
1660 {
1661 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1663
1665 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1666 for(
Index k=0; k<depth; k++)
1667 {
1668 LhsScalar A0 = blA[k];
1669 RhsScalar B_0 = blB[k];
1670 CJMADD(cj, A0, B_0, C0, B_0);
1671 }
1672 res(i, j2) += alpha * C0;
1673 }
1674 }
1675 }
1676 }
#define CJMADD(CJ, A, B, C, T)
Definition GeneralBlockPanelKernel.h:336
#define EIGEN_GEBGP_ONESTEP(K)
#define EIGEN_GEBP_ONESTEP(K)
#define EIGEN_ASM_COMMENT(X)
Definition Macros.h:624
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:151
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition GenericPacketMath.h:299
const std::ptrdiff_t defaultL1CacheSize
Definition GeneralBlockPanelKernel.h:33
EIGEN_DEVICE_FUNC conditional<(unpacket_traits< Packet >::size%8)==0, typename unpacket_traits< Packet >::half, Packet >::type predux_downto4(const Packet &a)
Definition GenericPacketMath.h:332
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:33
size_t cols(const T &raster)
Definition MarchingSquares.hpp:60
size_t rows(const T &raster)
Definition MarchingSquares.hpp:55
DataMapper::LinearMapper LinearMapper
Definition GeneralBlockPanelKernel.h:875
Traits::RhsPacket RhsPacket
Definition GeneralBlockPanelKernel.h:864
SwappedTraits::ResPacket SResPacket
Definition GeneralBlockPanelKernel.h:872
SwappedTraits::AccPacket SAccPacket
Definition GeneralBlockPanelKernel.h:873
SwappedTraits::RhsPacket SRhsPacket
Definition GeneralBlockPanelKernel.h:871
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs > Traits
Definition GeneralBlockPanelKernel.h:861
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs > SwappedTraits
Definition GeneralBlockPanelKernel.h:868
Traits::LhsPacket LhsPacket
Definition GeneralBlockPanelKernel.h:863
Traits::ResScalar ResScalar
Definition GeneralBlockPanelKernel.h:862
SwappedTraits::LhsPacket SLhsPacket
Definition GeneralBlockPanelKernel.h:870
Traits::ResPacket ResPacket
Definition GeneralBlockPanelKernel.h:865
Traits::AccPacket AccPacket
Definition GeneralBlockPanelKernel.h:866
T half
Definition XprHelper.h:161