@@ -596,6 +596,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
596596 ci -> i_truncate_seq = 0 ;
597597 ci -> i_truncate_size = 0 ;
598598 ci -> i_truncate_pending = 0 ;
599+ ci -> i_truncate_pagecache_size = 0 ;
599600
600601 ci -> i_max_size = 0 ;
601602 ci -> i_reported_size = 0 ;
@@ -767,6 +768,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
767768 dout ("truncate_size %lld -> %llu\n" , ci -> i_truncate_size ,
768769 truncate_size );
769770 ci -> i_truncate_size = truncate_size ;
771+ if (IS_ENCRYPTED (inode ))
772+ ci -> i_truncate_pagecache_size = size ;
773+ else
774+ ci -> i_truncate_pagecache_size = truncate_size ;
770775 }
771776 return queue_trunc ;
772777}
@@ -2147,7 +2152,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
21472152 /* there should be no reader or writer */
21482153 WARN_ON_ONCE (ci -> i_rd_ref || ci -> i_wr_ref );
21492154
2150- to = ci -> i_truncate_size ;
2155+ to = ci -> i_truncate_pagecache_size ;
21512156 wrbuffer_refs = ci -> i_wrbuffer_ref ;
21522157 dout ("__do_pending_vmtruncate %p (%d) to %lld\n" , inode ,
21532158 ci -> i_truncate_pending , to );
@@ -2157,7 +2162,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
21572162 truncate_pagecache (inode , to );
21582163
21592164 spin_lock (& ci -> i_ceph_lock );
2160- if (to == ci -> i_truncate_size ) {
2165+ if (to == ci -> i_truncate_pagecache_size ) {
21612166 ci -> i_truncate_pending = 0 ;
21622167 finish = 1 ;
21632168 }
@@ -2241,6 +2246,144 @@ static const struct inode_operations ceph_encrypted_symlink_iops = {
22412246 .listxattr = ceph_listxattr ,
22422247};
22432248
2249+ /*
2250+ * Transfer the encrypted last block to the MDS and the MDS
2251+ * will help update it when truncating a smaller size.
2252+ *
2253+ * We don't support a PAGE_SIZE that is smaller than the
2254+ * CEPH_FSCRYPT_BLOCK_SIZE.
2255+ */
2256+ static int fill_fscrypt_truncate (struct inode * inode ,
2257+ struct ceph_mds_request * req ,
2258+ struct iattr * attr )
2259+ {
2260+ struct ceph_inode_info * ci = ceph_inode (inode );
2261+ int boff = attr -> ia_size % CEPH_FSCRYPT_BLOCK_SIZE ;
2262+ loff_t pos , orig_pos = round_down (attr -> ia_size ,
2263+ CEPH_FSCRYPT_BLOCK_SIZE );
2264+ u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT ;
2265+ struct ceph_pagelist * pagelist = NULL ;
2266+ struct kvec iov = {0 };
2267+ struct iov_iter iter ;
2268+ struct page * page = NULL ;
2269+ struct ceph_fscrypt_truncate_size_header header ;
2270+ int retry_op = 0 ;
2271+ int len = CEPH_FSCRYPT_BLOCK_SIZE ;
2272+ loff_t i_size = i_size_read (inode );
2273+ int got , ret , issued ;
2274+ u64 objver ;
2275+
2276+ ret = __ceph_get_caps (inode , NULL , CEPH_CAP_FILE_RD , 0 , -1 , & got );
2277+ if (ret < 0 )
2278+ return ret ;
2279+
2280+ issued = __ceph_caps_issued (ci , NULL );
2281+
2282+ dout ("%s size %lld -> %lld got cap refs on %s, issued %s\n" , __func__ ,
2283+ i_size , attr -> ia_size , ceph_cap_string (got ),
2284+ ceph_cap_string (issued ));
2285+
2286+ /* Try to writeback the dirty pagecaches */
2287+ if (issued & (CEPH_CAP_FILE_BUFFER )) {
2288+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1 ;
2289+
2290+ ret = filemap_write_and_wait_range (inode -> i_mapping ,
2291+ orig_pos , lend );
2292+ if (ret < 0 )
2293+ goto out ;
2294+ }
2295+
2296+ page = __page_cache_alloc (GFP_KERNEL );
2297+ if (page == NULL ) {
2298+ ret = - ENOMEM ;
2299+ goto out ;
2300+ }
2301+
2302+ pagelist = ceph_pagelist_alloc (GFP_KERNEL );
2303+ if (!pagelist ) {
2304+ ret = - ENOMEM ;
2305+ goto out ;
2306+ }
2307+
2308+ iov .iov_base = kmap_local_page (page );
2309+ iov .iov_len = len ;
2310+ iov_iter_kvec (& iter , READ , & iov , 1 , len );
2311+
2312+ pos = orig_pos ;
2313+ ret = __ceph_sync_read (inode , & pos , & iter , & retry_op , & objver );
2314+ if (ret < 0 )
2315+ goto out ;
2316+
2317+ /* Insert the header first */
2318+ header .ver = 1 ;
2319+ header .compat = 1 ;
2320+ header .change_attr = cpu_to_le64 (inode_peek_iversion_raw (inode ));
2321+
2322+ /*
2323+ * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
2324+ * because in MDS it may need this to do the truncate.
2325+ */
2326+ header .block_size = cpu_to_le32 (CEPH_FSCRYPT_BLOCK_SIZE );
2327+
2328+ /*
2329+ * If we hit a hole here, we should just skip filling
2330+ * the fscrypt for the request, because once the fscrypt
2331+ * is enabled, the file will be split into many blocks
2332+ * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
2333+ * has a hole, the hole size should be multiple of block
2334+ * size.
2335+ *
2336+ * If the Rados object doesn't exist, it will be set to 0.
2337+ */
2338+ if (!objver ) {
2339+ dout ("%s hit hole, ppos %lld < size %lld\n" , __func__ ,
2340+ pos , i_size );
2341+
2342+ header .data_len = cpu_to_le32 (8 + 8 + 4 );
2343+ header .file_offset = 0 ;
2344+ ret = 0 ;
2345+ } else {
2346+ header .data_len = cpu_to_le32 (8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE );
2347+ header .file_offset = cpu_to_le64 (orig_pos );
2348+
2349+ /* truncate and zero out the extra contents for the last block */
2350+ memset (iov .iov_base + boff , 0 , PAGE_SIZE - boff );
2351+
2352+ /* encrypt the last block */
2353+ ret = ceph_fscrypt_encrypt_block_inplace (inode , page ,
2354+ CEPH_FSCRYPT_BLOCK_SIZE ,
2355+ 0 , block ,
2356+ GFP_KERNEL );
2357+ if (ret )
2358+ goto out ;
2359+ }
2360+
2361+ /* Insert the header */
2362+ ret = ceph_pagelist_append (pagelist , & header , sizeof (header ));
2363+ if (ret )
2364+ goto out ;
2365+
2366+ if (header .block_size ) {
2367+ /* Append the last block contents to pagelist */
2368+ ret = ceph_pagelist_append (pagelist , iov .iov_base ,
2369+ CEPH_FSCRYPT_BLOCK_SIZE );
2370+ if (ret )
2371+ goto out ;
2372+ }
2373+ req -> r_pagelist = pagelist ;
2374+ out :
2375+ dout ("%s %p size dropping cap refs on %s\n" , __func__ ,
2376+ inode , ceph_cap_string (got ));
2377+ ceph_put_cap_refs (ci , got );
2378+ if (iov .iov_base )
2379+ kunmap_local (iov .iov_base );
2380+ if (page )
2381+ __free_pages (page , 0 );
2382+ if (ret && pagelist )
2383+ ceph_pagelist_release (pagelist );
2384+ return ret ;
2385+ }
2386+
22442387int __ceph_setattr (struct inode * inode , struct iattr * attr ,
22452388 struct ceph_iattr * cia )
22462389{
@@ -2249,13 +2392,17 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
22492392 struct ceph_mds_request * req ;
22502393 struct ceph_mds_client * mdsc = ceph_sb_to_client (inode -> i_sb )-> mdsc ;
22512394 struct ceph_cap_flush * prealloc_cf ;
2395+ loff_t isize = i_size_read (inode );
22522396 int issued ;
22532397 int release = 0 , dirtied = 0 ;
22542398 int mask = 0 ;
22552399 int err = 0 ;
22562400 int inode_dirty_flags = 0 ;
22572401 bool lock_snap_rwsem = false;
2402+ bool fill_fscrypt ;
2403+ int truncate_retry = 20 ; /* The RMW will take around 50ms */
22582404
2405+ retry :
22592406 prealloc_cf = ceph_alloc_cap_flush ();
22602407 if (!prealloc_cf )
22612408 return - ENOMEM ;
@@ -2267,6 +2414,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
22672414 return PTR_ERR (req );
22682415 }
22692416
2417+ fill_fscrypt = false;
22702418 spin_lock (& ci -> i_ceph_lock );
22712419 issued = __ceph_caps_issued (ci , NULL );
22722420
@@ -2388,10 +2536,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
23882536 }
23892537 }
23902538 if (ia_valid & ATTR_SIZE ) {
2391- loff_t isize = i_size_read (inode );
2392-
23932539 dout ("setattr %p size %lld -> %lld\n" , inode , isize , attr -> ia_size );
2394- if ((issued & CEPH_CAP_FILE_EXCL ) && attr -> ia_size >= isize ) {
2540+ /*
2541+ * The RMW is needed only when the new size is smaller and not
2542+ * aligned to CEPH_FSCRYPT_BLOCK_SIZE.
2543+ */
2544+ if (IS_ENCRYPTED (inode ) && attr -> ia_size < isize &&
2545+ (attr -> ia_size % CEPH_FSCRYPT_BLOCK_SIZE )) {
2546+ mask |= CEPH_SETATTR_SIZE ;
2547+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2548+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR ;
2549+ set_bit (CEPH_MDS_R_FSCRYPT_FILE , & req -> r_req_flags );
2550+ mask |= CEPH_SETATTR_FSCRYPT_FILE ;
2551+ req -> r_args .setattr .size =
2552+ cpu_to_le64 (round_up (attr -> ia_size ,
2553+ CEPH_FSCRYPT_BLOCK_SIZE ));
2554+ req -> r_args .setattr .old_size =
2555+ cpu_to_le64 (round_up (isize ,
2556+ CEPH_FSCRYPT_BLOCK_SIZE ));
2557+ req -> r_fscrypt_file = attr -> ia_size ;
2558+ fill_fscrypt = true;
2559+ } else if ((issued & CEPH_CAP_FILE_EXCL ) && attr -> ia_size >= isize ) {
23952560 if (attr -> ia_size > isize ) {
23962561 i_size_write (inode , attr -> ia_size );
23972562 inode -> i_blocks = calc_inode_blocks (attr -> ia_size );
@@ -2414,7 +2579,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
24142579 cpu_to_le64 (round_up (isize ,
24152580 CEPH_FSCRYPT_BLOCK_SIZE ));
24162581 req -> r_fscrypt_file = attr -> ia_size ;
2417- /* FIXME: client must zero out any partial blocks! */
24182582 } else {
24192583 req -> r_args .setattr .size = cpu_to_le64 (attr -> ia_size );
24202584 req -> r_args .setattr .old_size = cpu_to_le64 (isize );
@@ -2481,8 +2645,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
24812645
24822646 release &= issued ;
24832647 spin_unlock (& ci -> i_ceph_lock );
2484- if (lock_snap_rwsem )
2648+ if (lock_snap_rwsem ) {
24852649 up_read (& mdsc -> snap_rwsem );
2650+ lock_snap_rwsem = false;
2651+ }
24862652
24872653 if (inode_dirty_flags )
24882654 __mark_inode_dirty (inode , inode_dirty_flags );
@@ -2494,7 +2660,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr,
24942660 req -> r_args .setattr .mask = cpu_to_le32 (mask );
24952661 req -> r_num_caps = 1 ;
24962662 req -> r_stamp = attr -> ia_ctime ;
2663+ if (fill_fscrypt ) {
2664+ err = fill_fscrypt_truncate (inode , req , attr );
2665+ if (err )
2666+ goto out ;
2667+ }
2668+
2669+ /*
2670+ * The truncate request will return -EAGAIN when the
2671+ * last block has been updated just before the MDS
2672+ * successfully gets the xlock for the FILE lock. To
2673+ * avoid corrupting the file contents we need to retry
2674+ * it.
2675+ */
24972676 err = ceph_mdsc_do_request (mdsc , NULL , req );
2677+ if (err == - EAGAIN && truncate_retry -- ) {
2678+ dout ("setattr %p result=%d (%s locally, %d remote), retry it!\n" ,
2679+ inode , err , ceph_cap_string (dirtied ), mask );
2680+ ceph_mdsc_put_request (req );
2681+ ceph_free_cap_flush (prealloc_cf );
2682+ goto retry ;
2683+ }
24982684 }
24992685out :
25002686 dout ("setattr %p result=%d (%s locally, %d remote)\n" , inode , err ,
0 commit comments