@@ -223,6 +223,37 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
223223}
224224EXPORT_SYMBOL_NS_GPL (cxl_dpa_debug , "CXL" );
225225
226+ /* See request_skip() kernel-doc */
227+ static resource_size_t __adjust_skip (struct cxl_dev_state * cxlds ,
228+ const resource_size_t skip_base ,
229+ const resource_size_t skip_len ,
230+ const char * requester )
231+ {
232+ const resource_size_t skip_end = skip_base + skip_len - 1 ;
233+
234+ for (int i = 0 ; i < cxlds -> nr_partitions ; i ++ ) {
235+ const struct resource * part_res = & cxlds -> part [i ].res ;
236+ resource_size_t adjust_start , adjust_end , size ;
237+
238+ adjust_start = max (skip_base , part_res -> start );
239+ adjust_end = min (skip_end , part_res -> end );
240+
241+ if (adjust_end < adjust_start )
242+ continue ;
243+
244+ size = adjust_end - adjust_start + 1 ;
245+
246+ if (!requester )
247+ __release_region (& cxlds -> dpa_res , adjust_start , size );
248+ else if (!__request_region (& cxlds -> dpa_res , adjust_start , size ,
249+ requester , 0 ))
250+ return adjust_start - skip_base ;
251+ }
252+
253+ return skip_len ;
254+ }
255+ #define release_skip (c , b , l ) __adjust_skip((c), (b), (l), NULL)
256+
226257/*
227258 * Must be called in a context that synchronizes against this decoder's
228259 * port ->remove() callback (like an endpoint decoder sysfs attribute)
@@ -241,7 +272,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
241272 skip_start = res -> start - cxled -> skip ;
242273 __release_region (& cxlds -> dpa_res , res -> start , resource_size (res ));
243274 if (cxled -> skip )
244- __release_region ( & cxlds -> dpa_res , skip_start , cxled -> skip );
275+ release_skip ( cxlds , skip_start , cxled -> skip );
245276 cxled -> skip = 0 ;
246277 cxled -> dpa_res = NULL ;
247278 put_device (& cxled -> cxld .dev );
@@ -268,6 +299,58 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
268299 __cxl_dpa_release (cxled );
269300}
270301
302+ /**
303+ * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree
304+ * @cxlds: CXL.mem device context that parents @cxled
305+ * @cxled: Endpoint decoder establishing new allocation that skips lower DPA
306+ * @skip_base: DPA < start of new DPA allocation (DPAnew)
307+ * @skip_len: @skip_base + @skip_len == DPAnew
308+ *
309+ * DPA 'skip' arises from out-of-sequence DPA allocation events relative
310+ * to free capacity across multiple partitions. It is a wasteful event
311+ * as usable DPA gets thrown away, but if a deployment has, for example,
312+ * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM
313+ * DPA, the free RAM DPA must be sacrificed to start allocating PMEM.
314+ * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder
315+ * Protection" for more details.
316+ *
317+ * A 'skip' always covers the last allocated DPA in a previous partition
318+ * to the start of the current partition to allocate. Allocations never
319+ * start in the middle of a partition, and allocations are always
320+ * de-allocated in reverse order (see cxl_dpa_free(), or natural devm
321+ * unwind order from forced in-order allocation).
322+ *
323+ * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip'
324+ * would always be contained to a single partition. Given
325+ * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip'
326+ * might span "tail capacity of partition[0], all of partition[1], ...,
327+ * all of partition[N-1]" to support allocating from partition[N]. That
328+ * in turn interacts with the partition 'struct resource' boundaries
329+ * within @cxlds->dpa_res whereby 'skip' requests need to be divided by
330+ * partition. I.e. this is a quirk of using a 'struct resource' tree to
331+ * detect range conflicts while also tracking partition boundaries in
332+ * @cxlds->dpa_res.
333+ */
334+ static int request_skip (struct cxl_dev_state * cxlds ,
335+ struct cxl_endpoint_decoder * cxled ,
336+ const resource_size_t skip_base ,
337+ const resource_size_t skip_len )
338+ {
339+ resource_size_t skipped = __adjust_skip (cxlds , skip_base , skip_len ,
340+ dev_name (& cxled -> cxld .dev ));
341+
342+ if (skipped == skip_len )
343+ return 0 ;
344+
345+ dev_dbg (cxlds -> dev ,
346+ "%s: failed to reserve skipped space (%pa %pa %pa)\n" ,
347+ dev_name (& cxled -> cxld .dev ), & skip_base , & skip_len , & skipped );
348+
349+ release_skip (cxlds , skip_base , skipped );
350+
351+ return - EBUSY ;
352+ }
353+
271354static int __cxl_dpa_reserve (struct cxl_endpoint_decoder * cxled ,
272355 resource_size_t base , resource_size_t len ,
273356 resource_size_t skipped )
@@ -276,7 +359,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
276359 struct cxl_port * port = cxled_to_port (cxled );
277360 struct cxl_dev_state * cxlds = cxlmd -> cxlds ;
278361 struct device * dev = & port -> dev ;
362+ enum cxl_decoder_mode mode ;
279363 struct resource * res ;
364+ int rc ;
280365
281366 lockdep_assert_held_write (& cxl_dpa_rwsem );
282367
@@ -305,37 +390,33 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
305390 }
306391
307392 if (skipped ) {
308- res = __request_region (& cxlds -> dpa_res , base - skipped , skipped ,
309- dev_name (& cxled -> cxld .dev ), 0 );
310- if (!res ) {
311- dev_dbg (dev ,
312- "decoder%d.%d: failed to reserve skipped space\n" ,
313- port -> id , cxled -> cxld .id );
314- return - EBUSY ;
315- }
393+ rc = request_skip (cxlds , cxled , base - skipped , skipped );
394+ if (rc )
395+ return rc ;
316396 }
317397 res = __request_region (& cxlds -> dpa_res , base , len ,
318398 dev_name (& cxled -> cxld .dev ), 0 );
319399 if (!res ) {
320400 dev_dbg (dev , "decoder%d.%d: failed to reserve allocation\n" ,
321401 port -> id , cxled -> cxld .id );
322402 if (skipped )
323- __release_region (& cxlds -> dpa_res , base - skipped ,
324- skipped );
403+ release_skip (cxlds , base - skipped , skipped );
325404 return - EBUSY ;
326405 }
327406 cxled -> dpa_res = res ;
328407 cxled -> skip = skipped ;
329408
330- if (to_pmem_res (cxlds ) && resource_contains (to_pmem_res (cxlds ), res ))
331- cxled -> mode = CXL_DECODER_PMEM ;
332- else if (to_ram_res (cxlds ) && resource_contains (to_ram_res (cxlds ), res ))
333- cxled -> mode = CXL_DECODER_RAM ;
334- else {
409+ mode = CXL_DECODER_NONE ;
410+ for (int i = 0 ; cxlds -> nr_partitions ; i ++ )
411+ if (resource_contains (& cxlds -> part [i ].res , res )) {
412+ mode = cxl_part_mode (cxlds -> part [i ].mode );
413+ break ;
414+ }
415+
416+ if (mode == CXL_DECODER_NONE )
335417 dev_warn (dev , "decoder%d.%d: %pr does not map any partition\n" ,
336418 port -> id , cxled -> cxld .id , res );
337- cxled -> mode = CXL_DECODER_NONE ;
338- }
419+ cxled -> mode = mode ;
339420
340421 port -> hdm_end ++ ;
341422 get_device (& cxled -> cxld .dev );
@@ -542,15 +623,13 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
542623int cxl_dpa_alloc (struct cxl_endpoint_decoder * cxled , unsigned long long size )
543624{
544625 struct cxl_memdev * cxlmd = cxled_to_memdev (cxled );
545- resource_size_t free_ram_start , free_pmem_start ;
546626 struct cxl_port * port = cxled_to_port (cxled );
547627 struct cxl_dev_state * cxlds = cxlmd -> cxlds ;
548628 struct device * dev = & cxled -> cxld .dev ;
549- resource_size_t start , avail , skip ;
629+ struct resource * res , * prev = NULL ;
630+ resource_size_t start , avail , skip , skip_start ;
550631 struct resource * p , * last ;
551- const struct resource * ram_res = to_ram_res (cxlds );
552- const struct resource * pmem_res = to_pmem_res (cxlds );
553- int rc ;
632+ int part , rc ;
554633
555634 down_write (& cxl_dpa_rwsem );
556635 if (cxled -> cxld .region ) {
@@ -566,47 +645,53 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
566645 goto out ;
567646 }
568647
569- for (p = ram_res -> child , last = NULL ; p ; p = p -> sibling )
570- last = p ;
571- if (last )
572- free_ram_start = last -> end + 1 ;
573- else
574- free_ram_start = ram_res -> start ;
648+ part = -1 ;
649+ for (int i = 0 ; i < cxlds -> nr_partitions ; i ++ ) {
650+ if (cxled -> mode == cxl_part_mode (cxlds -> part [i ].mode )) {
651+ part = i ;
652+ break ;
653+ }
654+ }
655+
656+ if (part < 0 ) {
657+ rc = - EBUSY ;
658+ goto out ;
659+ }
575660
576- for (p = pmem_res -> child , last = NULL ; p ; p = p -> sibling )
661+ res = & cxlds -> part [part ].res ;
662+ for (p = res -> child , last = NULL ; p ; p = p -> sibling )
577663 last = p ;
578664 if (last )
579- free_pmem_start = last -> end + 1 ;
665+ start = last -> end + 1 ;
580666 else
581- free_pmem_start = pmem_res -> start ;
667+ start = res -> start ;
582668
583- if (cxled -> mode == CXL_DECODER_RAM ) {
584- start = free_ram_start ;
585- avail = ram_res -> end - start + 1 ;
586- skip = 0 ;
587- } else if (cxled -> mode == CXL_DECODER_PMEM ) {
588- resource_size_t skip_start , skip_end ;
589-
590- start = free_pmem_start ;
591- avail = pmem_res -> end - start + 1 ;
592- skip_start = free_ram_start ;
593-
594- /*
595- * If some pmem is already allocated, then that allocation
596- * already handled the skip.
597- */
598- if (pmem_res -> child &&
599- skip_start == pmem_res -> child -> start )
600- skip_end = skip_start - 1 ;
601- else
602- skip_end = start - 1 ;
603- skip = skip_end - skip_start + 1 ;
604- } else {
605- dev_dbg (dev , "mode not set\n" );
606- rc = - EINVAL ;
607- goto out ;
669+ /*
670+ * To allocate at partition N, a skip needs to be calculated for all
671+ * unallocated space at lower partitions indices.
672+ *
673+ * If a partition has any allocations, the search can end because a
674+ * previous cxl_dpa_alloc() invocation is assumed to have accounted for
675+ * all previous partitions.
676+ */
677+ skip_start = CXL_RESOURCE_NONE ;
678+ for (int i = part ; i ; i -- ) {
679+ prev = & cxlds -> part [i - 1 ].res ;
680+ for (p = prev -> child , last = NULL ; p ; p = p -> sibling )
681+ last = p ;
682+ if (last ) {
683+ skip_start = last -> end + 1 ;
684+ break ;
685+ }
686+ skip_start = prev -> start ;
608687 }
609688
689+ avail = res -> end - start + 1 ;
690+ if (skip_start == CXL_RESOURCE_NONE )
691+ skip = 0 ;
692+ else
693+ skip = res -> start - skip_start ;
694+
610695 if (size > avail ) {
611696 dev_dbg (dev , "%pa exceeds available %s capacity: %pa\n" , & size ,
612697 cxl_decoder_mode_name (cxled -> mode ), & avail );
0 commit comments