Prusa Slicer 2.6.0
Loading...
Searching...
No Matches
Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs > Struct Template Reference

#include <src/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h>

Public Types

enum  { Vectorizable = Traits::Vectorizable , LhsProgress = Traits::LhsProgress , RhsProgress = Traits::RhsProgress , ResPacketSize = Traits::ResPacketSize }
 
typedef gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs > Traits
 
typedef Traits::ResScalar ResScalar
 
typedef Traits::LhsPacket LhsPacket
 
typedef Traits::RhsPacket RhsPacket
 
typedef Traits::ResPacket ResPacket
 
typedef Traits::AccPacket AccPacket
 
typedef gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs > SwappedTraits
 
typedef SwappedTraits::ResScalar SResScalar
 
typedef SwappedTraits::LhsPacket SLhsPacket
 
typedef SwappedTraits::RhsPacket SRhsPacket
 
typedef SwappedTraits::ResPacket SResPacket
 
typedef SwappedTraits::AccPacket SAccPacket
 
typedef DataMapper::LinearMapper LinearMapper
 

Public Member Functions

EIGEN_DONT_INLINE void operator() (const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
 

Detailed Description

template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >

Member Typedef Documentation

◆ AccPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::AccPacket

◆ LhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LhsPacket

◆ LinearMapper

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef DataMapper::LinearMapper Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LinearMapper

◆ ResPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResPacket

◆ ResScalar

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::ResScalar Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResScalar

◆ RhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPacket

◆ SAccPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SAccPacket

◆ SLhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SLhsPacket

◆ SResPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SResPacket

◆ SResScalar

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::ResScalar Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SResScalar

◆ SRhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SRhsPacket

◆ SwappedTraits

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SwappedTraits

◆ Traits

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::Traits

Member Enumeration Documentation

◆ anonymous enum

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
anonymous enum
Enumerator
Vectorizable 
LhsProgress 
RhsProgress 
ResPacketSize 
877 {
882 };
@ ResPacketSize
Definition GeneralBlockPanelKernel.h:364
@ LhsProgress
Definition GeneralBlockPanelKernel.h:382
@ RhsProgress
Definition GeneralBlockPanelKernel.h:383
@ Vectorizable
Definition GeneralBlockPanelKernel.h:361
@ ResPacketSize
Definition GeneralBlockPanelKernel.h:881
@ Vectorizable
Definition GeneralBlockPanelKernel.h:878
@ RhsProgress
Definition GeneralBlockPanelKernel.h:880
@ LhsProgress
Definition GeneralBlockPanelKernel.h:879

Member Function Documentation

◆ operator()()

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE void Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::operator() ( const DataMapper &  res,
const LhsScalar *  blockA,
const RhsScalar *  blockB,
Index  rows,
Index  depth,
Index  cols,
ResScalar  alpha,
Index  strideA = -1,
Index  strideB = -1,
Index  offsetA = 0,
Index  offsetB = 0 
)
896 {
897 Traits traits;
898 SwappedTraits straits;
899
900 if(strideA==-1) strideA = depth;
901 if(strideB==-1) strideB = depth;
902 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
903 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
904 const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
905 const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
906 const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
907 enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
908 const Index peeled_kc = depth & ~(pk-1);
909 const Index prefetch_res_offset = 32/sizeof(ResScalar);
910// const Index depth2 = depth & ~1;
911
912 //---------- Process 3 * LhsProgress rows at once ----------
913 // This corresponds to 3*LhsProgress x nr register blocks.
914 // Usually, this makes sense only with FMA
915 if(mr>=3*Traits::LhsProgress)
916 {
917 // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
918 // and on each largest micro vertical panel of the rhs (depth * nr).
919 // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fits in L1.
920 // However, if depth is too small, we can extend the number of rows of these horizontal panels.
921 // This actual number of rows is computed as follows:
922 const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
923 // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
924 // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
925 // or because we are testing specific blocking sizes.
926 const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
927 for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
928 {
929 const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
930 for(Index j2=0; j2<packet_cols4; j2+=nr)
931 {
932 for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
933 {
934
935 // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
936 // stored into 3 x nr registers.
937
938 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
939 prefetch(&blA[0]);
940
941 // gets res block as register
942 AccPacket C0, C1, C2, C3,
943 C4, C5, C6, C7,
944 C8, C9, C10, C11;
945 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
946 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
947 traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
948
949 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
950 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
951 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
952 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
953
954 r0.prefetch(0);
955 r1.prefetch(0);
956 r2.prefetch(0);
957 r3.prefetch(0);
958
959 // performs "inner" products
960 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
961 prefetch(&blB[0]);
962 LhsPacket A0, A1;
963
964 for(Index k=0; k<peeled_kc; k+=pk)
965 {
966 EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
967 RhsPacket B_0, T0;
968 LhsPacket A2;
969
970#define EIGEN_GEBP_ONESTEP(K) \
971 do { \
972 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
973 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
974 internal::prefetch(blA+(3*K+16)*LhsProgress); \
975 if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
976 traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
977 traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
978 traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
979 traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
980 traits.madd(A0, B_0, C0, T0); \
981 traits.madd(A1, B_0, C4, T0); \
982 traits.madd(A2, B_0, C8, B_0); \
983 traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
984 traits.madd(A0, B_0, C1, T0); \
985 traits.madd(A1, B_0, C5, T0); \
986 traits.madd(A2, B_0, C9, B_0); \
987 traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
988 traits.madd(A0, B_0, C2, T0); \
989 traits.madd(A1, B_0, C6, T0); \
990 traits.madd(A2, B_0, C10, B_0); \
991 traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
992 traits.madd(A0, B_0, C3 , T0); \
993 traits.madd(A1, B_0, C7, T0); \
994 traits.madd(A2, B_0, C11, B_0); \
995 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
996 } while(false)
997
1007
1008 blB += pk*4*RhsProgress;
1009 blA += pk*3*Traits::LhsProgress;
1010
1011 EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1012 }
1013 // process remaining peeled loop
1014 for(Index k=peeled_kc; k<depth; k++)
1015 {
1016 RhsPacket B_0, T0;
1017 LhsPacket A2;
1019 blB += 4*RhsProgress;
1020 blA += 3*Traits::LhsProgress;
1021 }
1022
1023#undef EIGEN_GEBP_ONESTEP
1024
1025 ResPacket R0, R1, R2;
1026 ResPacket alphav = pset1<ResPacket>(alpha);
1027
1028 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1029 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1030 R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1031 traits.acc(C0, alphav, R0);
1032 traits.acc(C4, alphav, R1);
1033 traits.acc(C8, alphav, R2);
1034 r0.storePacket(0 * Traits::ResPacketSize, R0);
1035 r0.storePacket(1 * Traits::ResPacketSize, R1);
1036 r0.storePacket(2 * Traits::ResPacketSize, R2);
1037
1038 R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1039 R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1040 R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1041 traits.acc(C1, alphav, R0);
1042 traits.acc(C5, alphav, R1);
1043 traits.acc(C9, alphav, R2);
1044 r1.storePacket(0 * Traits::ResPacketSize, R0);
1045 r1.storePacket(1 * Traits::ResPacketSize, R1);
1046 r1.storePacket(2 * Traits::ResPacketSize, R2);
1047
1048 R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1049 R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1050 R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1051 traits.acc(C2, alphav, R0);
1052 traits.acc(C6, alphav, R1);
1053 traits.acc(C10, alphav, R2);
1054 r2.storePacket(0 * Traits::ResPacketSize, R0);
1055 r2.storePacket(1 * Traits::ResPacketSize, R1);
1056 r2.storePacket(2 * Traits::ResPacketSize, R2);
1057
1058 R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1059 R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1060 R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1061 traits.acc(C3, alphav, R0);
1062 traits.acc(C7, alphav, R1);
1063 traits.acc(C11, alphav, R2);
1064 r3.storePacket(0 * Traits::ResPacketSize, R0);
1065 r3.storePacket(1 * Traits::ResPacketSize, R1);
1066 r3.storePacket(2 * Traits::ResPacketSize, R2);
1067 }
1068 }
1069
1070 // Deal with remaining columns of the rhs
1071 for(Index j2=packet_cols4; j2<cols; j2++)
1072 {
1073 for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1074 {
1075 // One column at a time
1076 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1077 prefetch(&blA[0]);
1078
1079 // gets res block as register
1080 AccPacket C0, C4, C8;
1081 traits.initAcc(C0);
1082 traits.initAcc(C4);
1083 traits.initAcc(C8);
1084
1085 LinearMapper r0 = res.getLinearMapper(i, j2);
1086 r0.prefetch(0);
1087
1088 // performs "inner" products
1089 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1090 LhsPacket A0, A1, A2;
1091
1092 for(Index k=0; k<peeled_kc; k+=pk)
1093 {
1094 EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1095 RhsPacket B_0;
1096#define EIGEN_GEBGP_ONESTEP(K) \
1097 do { \
1098 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1099 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1100 traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1101 traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1102 traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1103 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1104 traits.madd(A0, B_0, C0, B_0); \
1105 traits.madd(A1, B_0, C4, B_0); \
1106 traits.madd(A2, B_0, C8, B_0); \
1107 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1108 } while(false)
1109
1118
1119 blB += pk*RhsProgress;
1120 blA += pk*3*Traits::LhsProgress;
1121
1122 EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1123 }
1124
1125 // process remaining peeled loop
1126 for(Index k=peeled_kc; k<depth; k++)
1127 {
1128 RhsPacket B_0;
1130 blB += RhsProgress;
1131 blA += 3*Traits::LhsProgress;
1132 }
1133#undef EIGEN_GEBGP_ONESTEP
1134 ResPacket R0, R1, R2;
1135 ResPacket alphav = pset1<ResPacket>(alpha);
1136
1137 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1138 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1139 R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1140 traits.acc(C0, alphav, R0);
1141 traits.acc(C4, alphav, R1);
1142 traits.acc(C8, alphav, R2);
1143 r0.storePacket(0 * Traits::ResPacketSize, R0);
1144 r0.storePacket(1 * Traits::ResPacketSize, R1);
1145 r0.storePacket(2 * Traits::ResPacketSize, R2);
1146 }
1147 }
1148 }
1149 }
1150
1151 //---------- Process 2 * LhsProgress rows at once ----------
1152 if(mr>=2*Traits::LhsProgress)
1153 {
1154 const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1155 // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1156 // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1157 // or because we are testing specific blocking sizes.
1158 Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1159
1160 for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1161 {
1162 Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1163 for(Index j2=0; j2<packet_cols4; j2+=nr)
1164 {
1165 for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1166 {
1167
1168 // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1169 // stored into 2 x nr registers.
1170
1171 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1172 prefetch(&blA[0]);
1173
1174 // gets res block as register
1175 AccPacket C0, C1, C2, C3,
1176 C4, C5, C6, C7;
1177 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1178 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1179
1180 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1181 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1182 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1183 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1184
1185 r0.prefetch(prefetch_res_offset);
1186 r1.prefetch(prefetch_res_offset);
1187 r2.prefetch(prefetch_res_offset);
1188 r3.prefetch(prefetch_res_offset);
1189
1190 // performs "inner" products
1191 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1192 prefetch(&blB[0]);
1193 LhsPacket A0, A1;
1194
1195 for(Index k=0; k<peeled_kc; k+=pk)
1196 {
1197 EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1198 RhsPacket B_0, B1, B2, B3, T0;
1199
1200 // NOTE: the begin/end asm comments below work around bug 935!
1201 // but they are not enough for gcc>=6 without FMA (bug 1637)
1202 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1203 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1204 #else
1205 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1206 #endif
1207 #define EIGEN_GEBGP_ONESTEP(K) \
1208 do { \
1209 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1210 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1211 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1212 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1213 traits.madd(A0, B_0, C0, T0); \
1214 traits.madd(A1, B_0, C4, B_0); \
1215 traits.madd(A0, B1, C1, T0); \
1216 traits.madd(A1, B1, C5, B1); \
1217 traits.madd(A0, B2, C2, T0); \
1218 traits.madd(A1, B2, C6, B2); \
1219 traits.madd(A0, B3, C3, T0); \
1220 traits.madd(A1, B3, C7, B3); \
1221 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1222 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1223 } while(false)
1224
1225 internal::prefetch(blB+(48+0));
1230 internal::prefetch(blB+(48+16));
1235
1236 blB += pk*4*RhsProgress;
1237 blA += pk*(2*Traits::LhsProgress);
1238
1239 EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1240 }
1241 // process remaining peeled loop
1242 for(Index k=peeled_kc; k<depth; k++)
1243 {
1244 RhsPacket B_0, B1, B2, B3, T0;
1246 blB += 4*RhsProgress;
1247 blA += 2*Traits::LhsProgress;
1248 }
1249#undef EIGEN_GEBGP_ONESTEP
1250
1251 ResPacket R0, R1, R2, R3;
1252 ResPacket alphav = pset1<ResPacket>(alpha);
1253
1254 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1255 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1256 R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1257 R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1258 traits.acc(C0, alphav, R0);
1259 traits.acc(C4, alphav, R1);
1260 traits.acc(C1, alphav, R2);
1261 traits.acc(C5, alphav, R3);
1262 r0.storePacket(0 * Traits::ResPacketSize, R0);
1263 r0.storePacket(1 * Traits::ResPacketSize, R1);
1264 r1.storePacket(0 * Traits::ResPacketSize, R2);
1265 r1.storePacket(1 * Traits::ResPacketSize, R3);
1266
1267 R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1268 R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1269 R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1270 R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1271 traits.acc(C2, alphav, R0);
1272 traits.acc(C6, alphav, R1);
1273 traits.acc(C3, alphav, R2);
1274 traits.acc(C7, alphav, R3);
1275 r2.storePacket(0 * Traits::ResPacketSize, R0);
1276 r2.storePacket(1 * Traits::ResPacketSize, R1);
1277 r3.storePacket(0 * Traits::ResPacketSize, R2);
1278 r3.storePacket(1 * Traits::ResPacketSize, R3);
1279 }
1280 }
1281
1282 // Deal with remaining columns of the rhs
1283 for(Index j2=packet_cols4; j2<cols; j2++)
1284 {
1285 for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1286 {
1287 // One column at a time
1288 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1289 prefetch(&blA[0]);
1290
1291 // gets res block as register
1292 AccPacket C0, C4;
1293 traits.initAcc(C0);
1294 traits.initAcc(C4);
1295
1296 LinearMapper r0 = res.getLinearMapper(i, j2);
1297 r0.prefetch(prefetch_res_offset);
1298
1299 // performs "inner" products
1300 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1301 LhsPacket A0, A1;
1302
1303 for(Index k=0; k<peeled_kc; k+=pk)
1304 {
1305 EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1306 RhsPacket B_0, B1;
1307
1308#define EIGEN_GEBGP_ONESTEP(K) \
1309 do { \
1310 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1311 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1312 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1313 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1314 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1315 traits.madd(A0, B_0, C0, B1); \
1316 traits.madd(A1, B_0, C4, B_0); \
1317 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1318 } while(false)
1319
1328
1329 blB += pk*RhsProgress;
1330 blA += pk*2*Traits::LhsProgress;
1331
1332 EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1333 }
1334
1335 // process remaining peeled loop
1336 for(Index k=peeled_kc; k<depth; k++)
1337 {
1338 RhsPacket B_0, B1;
1340 blB += RhsProgress;
1341 blA += 2*Traits::LhsProgress;
1342 }
1343#undef EIGEN_GEBGP_ONESTEP
1344 ResPacket R0, R1;
1345 ResPacket alphav = pset1<ResPacket>(alpha);
1346
1347 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1348 R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1349 traits.acc(C0, alphav, R0);
1350 traits.acc(C4, alphav, R1);
1351 r0.storePacket(0 * Traits::ResPacketSize, R0);
1352 r0.storePacket(1 * Traits::ResPacketSize, R1);
1353 }
1354 }
1355 }
1356 }
1357 //---------- Process 1 * LhsProgress rows at once ----------
1358 if(mr>=1*Traits::LhsProgress)
1359 {
1360 // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1361 for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1362 {
1363 // loops on each largest micro vertical panel of rhs (depth * nr)
1364 for(Index j2=0; j2<packet_cols4; j2+=nr)
1365 {
1366 // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1367 // stored into 1 x nr registers.
1368
1369 const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1370 prefetch(&blA[0]);
1371
1372 // gets res block as register
1373 AccPacket C0, C1, C2, C3;
1374 traits.initAcc(C0);
1375 traits.initAcc(C1);
1376 traits.initAcc(C2);
1377 traits.initAcc(C3);
1378
1379 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1380 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1381 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1382 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1383
1384 r0.prefetch(prefetch_res_offset);
1385 r1.prefetch(prefetch_res_offset);
1386 r2.prefetch(prefetch_res_offset);
1387 r3.prefetch(prefetch_res_offset);
1388
1389 // performs "inner" products
1390 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1391 prefetch(&blB[0]);
1392 LhsPacket A0;
1393
1394 for(Index k=0; k<peeled_kc; k+=pk)
1395 {
1396 EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1397 RhsPacket B_0, B1, B2, B3;
1398
1399#define EIGEN_GEBGP_ONESTEP(K) \
1400 do { \
1401 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1402 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1403 traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1404 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1405 traits.madd(A0, B_0, C0, B_0); \
1406 traits.madd(A0, B1, C1, B1); \
1407 traits.madd(A0, B2, C2, B2); \
1408 traits.madd(A0, B3, C3, B3); \
1409 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1410 } while(false)
1411
1412 internal::prefetch(blB+(48+0));
1417 internal::prefetch(blB+(48+16));
1422
1423 blB += pk*4*RhsProgress;
1424 blA += pk*1*LhsProgress;
1425
1426 EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1427 }
1428 // process remaining peeled loop
1429 for(Index k=peeled_kc; k<depth; k++)
1430 {
1431 RhsPacket B_0, B1, B2, B3;
1433 blB += 4*RhsProgress;
1434 blA += 1*LhsProgress;
1435 }
1436#undef EIGEN_GEBGP_ONESTEP
1437
1438 ResPacket R0, R1;
1439 ResPacket alphav = pset1<ResPacket>(alpha);
1440
1441 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1442 R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1443 traits.acc(C0, alphav, R0);
1444 traits.acc(C1, alphav, R1);
1445 r0.storePacket(0 * Traits::ResPacketSize, R0);
1446 r1.storePacket(0 * Traits::ResPacketSize, R1);
1447
1448 R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1449 R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1450 traits.acc(C2, alphav, R0);
1451 traits.acc(C3, alphav, R1);
1452 r2.storePacket(0 * Traits::ResPacketSize, R0);
1453 r3.storePacket(0 * Traits::ResPacketSize, R1);
1454 }
1455
1456 // Deal with remaining columns of the rhs
1457 for(Index j2=packet_cols4; j2<cols; j2++)
1458 {
1459 // One column at a time
1460 const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1461 prefetch(&blA[0]);
1462
1463 // gets res block as register
1464 AccPacket C0;
1465 traits.initAcc(C0);
1466
1467 LinearMapper r0 = res.getLinearMapper(i, j2);
1468
1469 // performs "inner" products
1470 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1471 LhsPacket A0;
1472
1473 for(Index k=0; k<peeled_kc; k+=pk)
1474 {
1475 EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1476 RhsPacket B_0;
1477
1478#define EIGEN_GEBGP_ONESTEP(K) \
1479 do { \
1480 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1481 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1482 traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1483 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484 traits.madd(A0, B_0, C0, B_0); \
1485 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1486 } while(false);
1487
1496
1497 blB += pk*RhsProgress;
1498 blA += pk*1*Traits::LhsProgress;
1499
1500 EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1501 }
1502
1503 // process remaining peeled loop
1504 for(Index k=peeled_kc; k<depth; k++)
1505 {
1506 RhsPacket B_0;
1508 blB += RhsProgress;
1509 blA += 1*Traits::LhsProgress;
1510 }
1511#undef EIGEN_GEBGP_ONESTEP
1512 ResPacket R0;
1513 ResPacket alphav = pset1<ResPacket>(alpha);
1514 R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1515 traits.acc(C0, alphav, R0);
1516 r0.storePacket(0 * Traits::ResPacketSize, R0);
1517 }
1518 }
1519 }
1520 //---------- Process remaining rows, 1 at once ----------
1521 if(peeled_mc1<rows)
1522 {
1523 // loop on each panel of the rhs
1524 for(Index j2=0; j2<packet_cols4; j2+=nr)
1525 {
1526 // loop on each row of the lhs (1*LhsProgress x depth)
1527 for(Index i=peeled_mc1; i<rows; i+=1)
1528 {
1529 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1530 prefetch(&blA[0]);
1531 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1532
1533 // The following piece of code won't work for 512-bit registers
1534 // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1535 // as nr (which is currently 4) for the return type.
1536 const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1537 if ((SwappedTraits::LhsProgress % 4) == 0 &&
1539 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
1540 {
1541 SAccPacket C0, C1, C2, C3;
1542 straits.initAcc(C0);
1543 straits.initAcc(C1);
1544 straits.initAcc(C2);
1545 straits.initAcc(C3);
1546
1547 const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1548 const Index endk = (depth/spk)*spk;
1549 const Index endk4 = (depth/(spk*4))*(spk*4);
1550
1551 Index k=0;
1552 for(; k<endk4; k+=4*spk)
1553 {
1554 SLhsPacket A0,A1;
1555 SRhsPacket B_0,B_1;
1556
1557 straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1558 straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1559
1560 straits.loadRhsQuad(blA+0*spk, B_0);
1561 straits.loadRhsQuad(blA+1*spk, B_1);
1562 straits.madd(A0,B_0,C0,B_0);
1563 straits.madd(A1,B_1,C1,B_1);
1564
1565 straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1566 straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1567 straits.loadRhsQuad(blA+2*spk, B_0);
1568 straits.loadRhsQuad(blA+3*spk, B_1);
1569 straits.madd(A0,B_0,C2,B_0);
1570 straits.madd(A1,B_1,C3,B_1);
1571
1573 blA += 4*spk;
1574 }
1575 C0 = padd(padd(C0,C1),padd(C2,C3));
1576 for(; k<endk; k+=spk)
1577 {
1578 SLhsPacket A0;
1579 SRhsPacket B_0;
1580
1581 straits.loadLhsUnaligned(blB, A0);
1582 straits.loadRhsQuad(blA, B_0);
1583 straits.madd(A0,B_0,C0,B_0);
1584
1586 blA += spk;
1587 }
1589 {
1590 // Special case where we have to first reduce the accumulation register C0
1591 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1592 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1593 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1594 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1595
1596 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1597 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1598
1599 if(depth-endk>0)
1600 {
1601 // We have to handle the last row of the rhs which corresponds to a half-packet
1602 SLhsPacketHalf a0;
1603 SRhsPacketHalf b0;
1604 straits.loadLhsUnaligned(blB, a0);
1605 straits.loadRhs(blA, b0);
1606 SAccPacketHalf c0 = predux_downto4(C0);
1607 straits.madd(a0,b0,c0,b0);
1608 straits.acc(c0, alphav, R);
1609 }
1610 else
1611 {
1612 straits.acc(predux_downto4(C0), alphav, R);
1613 }
1614 res.scatterPacket(i, j2, R);
1615 }
1616 else
1617 {
1618 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1619 SResPacket alphav = pset1<SResPacket>(alpha);
1620 straits.acc(C0, alphav, R);
1621 res.scatterPacket(i, j2, R);
1622 }
1623 }
1624 else // scalar path
1625 {
1626 // get a 1 x 4 res block as registers
1627 ResScalar C0(0), C1(0), C2(0), C3(0);
1628
1629 for(Index k=0; k<depth; k++)
1630 {
1631 LhsScalar A0;
1632 RhsScalar B_0, B_1;
1633
1634 A0 = blA[k];
1635
1636 B_0 = blB[0];
1637 B_1 = blB[1];
1638 CJMADD(cj,A0,B_0,C0, B_0);
1639 CJMADD(cj,A0,B_1,C1, B_1);
1640
1641 B_0 = blB[2];
1642 B_1 = blB[3];
1643 CJMADD(cj,A0,B_0,C2, B_0);
1644 CJMADD(cj,A0,B_1,C3, B_1);
1645
1646 blB += 4;
1647 }
1648 res(i, j2 + 0) += alpha * C0;
1649 res(i, j2 + 1) += alpha * C1;
1650 res(i, j2 + 2) += alpha * C2;
1651 res(i, j2 + 3) += alpha * C3;
1652 }
1653 }
1654 }
1655 // remaining columns
1656 for(Index j2=packet_cols4; j2<cols; j2++)
1657 {
1658 // loop on each row of the lhs (1*LhsProgress x depth)
1659 for(Index i=peeled_mc1; i<rows; i+=1)
1660 {
1661 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1662 prefetch(&blA[0]);
1663 // gets a 1 x 1 res block as registers
1664 ResScalar C0(0);
1665 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1666 for(Index k=0; k<depth; k++)
1667 {
1668 LhsScalar A0 = blA[k];
1669 RhsScalar B_0 = blB[k];
1670 CJMADD(cj, A0, B_0, C0, B_0);
1671 }
1672 res(i, j2) += alpha * C0;
1673 }
1674 }
1675 }
1676 }
#define CJMADD(CJ, A, B, C, T)
Definition GeneralBlockPanelKernel.h:336
#define EIGEN_GEBGP_ONESTEP(K)
#define EIGEN_GEBP_ONESTEP(K)
#define EIGEN_ASM_COMMENT(X)
Definition Macros.h:624
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:151
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition GenericPacketMath.h:299
const std::ptrdiff_t defaultL1CacheSize
Definition GeneralBlockPanelKernel.h:33
EIGEN_DEVICE_FUNC conditional<(unpacket_traits< Packet >::size%8)==0, typename unpacket_traits< Packet >::half, Packet >::type predux_downto4(const Packet &a)
Definition GenericPacketMath.h:332
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:33
size_t cols(const T &raster)
Definition MarchingSquares.hpp:60
size_t rows(const T &raster)
Definition MarchingSquares.hpp:55
DataMapper::LinearMapper LinearMapper
Definition GeneralBlockPanelKernel.h:875
Traits::RhsPacket RhsPacket
Definition GeneralBlockPanelKernel.h:864
SwappedTraits::ResPacket SResPacket
Definition GeneralBlockPanelKernel.h:872
SwappedTraits::AccPacket SAccPacket
Definition GeneralBlockPanelKernel.h:873
SwappedTraits::RhsPacket SRhsPacket
Definition GeneralBlockPanelKernel.h:871
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs > Traits
Definition GeneralBlockPanelKernel.h:861
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs > SwappedTraits
Definition GeneralBlockPanelKernel.h:868
Traits::LhsPacket LhsPacket
Definition GeneralBlockPanelKernel.h:863
Traits::ResScalar ResScalar
Definition GeneralBlockPanelKernel.h:862
SwappedTraits::LhsPacket SLhsPacket
Definition GeneralBlockPanelKernel.h:870
Traits::ResPacket ResPacket
Definition GeneralBlockPanelKernel.h:865
Traits::AccPacket AccPacket
Definition GeneralBlockPanelKernel.h:866
T half
Definition XprHelper.h:161

References Eigen::internal::gebp_traits< _LhsScalar, _RhsScalar, _ConjLhs, _ConjRhs >::acc(), CJMADD, Eigen::internal::defaultL1CacheSize, EIGEN_ASM_COMMENT, EIGEN_GEBGP_ONESTEP, EIGEN_GEBP_ONESTEP, Eigen::internal::gebp_traits< _LhsScalar, _RhsScalar, _ConjLhs, _ConjRhs >::initAcc(), Eigen::internal::gebp_traits< _LhsScalar, _RhsScalar, _ConjLhs, _ConjRhs >::loadLhsUnaligned(), Eigen::internal::gebp_traits< _LhsScalar, _RhsScalar, _ConjLhs, _ConjRhs >::loadRhs(), Eigen::internal::gebp_traits< _LhsScalar, _RhsScalar, _ConjLhs, _ConjRhs >::loadRhsQuad(), Eigen::internal::gebp_traits< _LhsScalar, _RhsScalar, _ConjLhs, _ConjRhs >::madd(), Eigen::internal::padd(), Eigen::internal::predux_downto4(), and Eigen::internal::prefetch().

+ Here is the call graph for this function:

The documentation for this struct was generated from the following file: