文件系统中数据读取的详细过程

    xiaoxiao2025-06-02  34

    一直在困惑文件系统sys_read、bio、io调度、硬中断、软中断、io完成通知之间的过程是怎么样的,通过代码的跟踪大致明白过程了

    内核态的系统调用与bio的交界处的函数mpage_bio_submit(fs/mpage.c)

    bio与io调度的交界处的函数__make_request(block/blk-core.c)

    io调度与驱动层的交界函数__generic_unplug_device(block/blk-core.c)

    io的返回路径

        在驱动层的io完成之后,注册的设备完成方法(例如:scsi_done,__scsi_done)中则会执行blk_complete_request

         blk_complete_request是硬件的设备驱动的硬件中断上下文的最后一个函数了

         blk_complete_request将会启动软中断BLOCK_SOFTIRQ

         这个过程向上的通知的过程到达bio层的时候,调用的函数就是 mpage_bio_submit中注册的bio的end_io(mpage_end_io_read| mpage_end_io_write)

    代码流程

    Fs/mpage.c static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, struct buffer_head *map_bh, unsigned long *first_logical_block, get_block_t get_block) { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; unsigned first_hole = blocks_per_page; struct block_device *bdev = NULL; int length; int fully_mapped = 1; unsigned nblocks; unsigned relative_block; if (page_has_buffers(page)) goto confused; block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; nblocks = map_bh->b_size >> blkbits; if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && block_in_file < (*first_logical_block + nblocks)) { unsigned map_offset = block_in_file - *first_logical_block; unsigned last = nblocks - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr + map_offset + relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } map_bh->b_page = page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; if (get_block(inode, block_in_file, map_bh, 0)) goto confused; *first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; 
continue; } if (buffer_uptodate(map_bh)) { map_buffer_to_page(page, map_bh, page_block); goto confused; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_page) break; blocks[page_block] = map_bh->b_blocknr+relative_block; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } if (first_hole != blocks_per_page) { zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); goto out; } } else if (fully_mapped) { SetPageMappedToDisk(page); } /* * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && (*last_block_in_bio != blocks[0] - 1)) bio = mpage_bio_submit(READ, bio); alloc_new: if (bio == NULL) { bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); if (bio == NULL) goto confused; } length = first_hole << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(READ, bio); goto alloc_new; } relative_block = block_in_file - *first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) bio = mpage_bio_submit(READ, bio); else *last_block_in_bio = blocks[blocks_per_page - 1]; out: return bio; confused: if (bio) bio = mpage_bio_submit(READ, bio); if (!PageUptodate(page)) block_read_full_page(page, get_block); else unlock_page(page); goto out; }

    do_mpage_readpage主要是将page转换为bio

    do_mpage_readpage中重点关注mpage_bio_submit和block_read_full_page Fs/buffer.c int block_read_full_page(struct page *page, get_block_t *get_block) { struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; int nr, i; int fully_mapped = 1; BUG_ON(!PageLocked(page)); blocksize = 1 << inode->i_blkbits; if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; bh = head; nr = 0; i = 0; do { if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { int err = 0; fully_mapped = 0; if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); if (err) SetPageError(page); } if (!buffer_mapped(bh)) { zero_user(page, i * blocksize, blocksize); if (!err) set_buffer_uptodate(bh); continue; } if (buffer_uptodate(bh)) continue; } arr[nr++] = bh; } while (i++, iblock++, (bh = bh->b_this_page) != head); if (fully_mapped) SetPageMappedToDisk(page); if (!nr) { if (!PageError(page)) SetPageUptodate(page); unlock_page(page); return 0; } /* Stage two: lock the buffers */ for (i = 0; i < nr; i++) { bh = arr[i]; lock_buffer(bh); mark_buffer_async_read(bh); } for (i = 0; i < nr; i++) { bh = arr[i]; if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else submit_bh(READ, bh); } return 0; } 重点关注submit_bh

    Fs/buffer.c int submit_bh(int rw, struct buffer_head * bh) { struct bio *bio; int ret = 0; if (buffer_ordered(bh) && (rw & WRITE)) rw |= WRITE_BARRIER; if (test_set_buffer_req(bh) && (rw & WRITE)) clear_buffer_write_io_error(bh); bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; bio->bi_io_vec[0].bv_len = bh->b_size; bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; bio->bi_idx = 0; bio->bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; bio_get(bio); submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); return ret; } do_mpage_readpage中执行mpage_bio_submit

    static struct bio *mpage_bio_submit(int rw,struct bio *bio) { bio->bi_end_io= mpage_end_io_read; if(rw == WRITE) bio->bi_end_io= mpage_end_io_write; submit_bio(rw, bio); return NULL; } Block/blk-core.c void submit_bio(int rw, struct bio *bio) { int count = bio_sectors(bio); bio->bi_rw |= rw; if (bio_has_data(bio)) { if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { task_io_account_read(bio->bi_size); count_vm_events(PGPGIN, count); } if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, bdevname(bio->bi_bdev, b)); } } generic_make_request(bio); } generic_make_request将会把请求递交给io调度层

    void generic_make_request(struct bio *bio) { if (current->bio_tail) { /* make_request is active */ *(current->bio_tail) = bio; bio->bi_next = NULL; current->bio_tail = &bio->bi_next; return; } BUG_ON(bio->bi_next); do { current->bio_list = bio->bi_next; if (bio->bi_next == NULL) current->bio_tail = &current->bio_list; else bio->bi_next = NULL; __generic_make_request(bio); bio = current->bio_list; } while (bio); current->bio_tail = NULL; /* deactivate */ } static inline void __generic_make_request(struct bio *bio) { struct request_queue *q; sector_t old_sector; int ret, nr_sectors = bio_sectors(bio); dev_t old_dev; int err = -EIO; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; old_sector = -1; old_dev = 0; do { char b[BDEVNAME_SIZE]; q = bdev_get_queue(bio->bi_bdev); if (unlikely(!q)) { goto end_io; } if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && nr_sectors > queue_max_hw_sectors(q))) { goto end_io; } if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) goto end_io; if (should_fail_request(bio)) goto end_io; blk_partition_remap(bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) goto end_io; if (old_sector != -1) trace_block_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; if (bio_check_eod(bio, nr_sectors)) goto end_io; if (bio_rw_flagged(bio, BIO_RW_DISCARD) && !blk_queue_discard(q)) { err = -EOPNOTSUPP; goto end_io; } trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); } while (ret); return; end_io: bio_endio(bio, err); }

    make_request_fn是何时指定的呢?

    需要关注请求如何从page=>bh=>bio=>request=>elevator

    void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) { q->nr_requests = BLKDEV_MAX_RQ; q->make_request_fn = mfn; blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ if (q->unplug_delay == 0) q->unplug_delay = 1; q->unplug_timer.function = blk_unplug_timeout; q->unplug_timer.data = (unsigned long)q; blk_set_default_limits(&q->limits); blk_queue_max_sectors(q, SAFE_MAX_SECTORS); if (!q->queue_lock) q->queue_lock = &q->__queue_lock; blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); } Block/blk-core.c struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); if (!q) return NULL; q->node = node_id; if (blk_init_free_list(q)) { kmem_cache_free(blk_requestq_cachep, q); return NULL; } q->request_fn = rfn; q->prep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; blk_queue_make_request(q, __make_request); q->sg_reserved_size = INT_MAX; if (!elevator_init(q, NULL)) { blk_queue_congestion_threshold(q); return q; } blk_put_queue(q); return NULL; } Block/blk-core.c static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; if (bio_rw_flagged(bio, BIO_RW_BARRIER) && (q->next_ordered == QUEUE_ORDERED_NONE)) { bio_endio(bio, -EOPNOTSUPP); return 0; } blk_queue_bounce(q, &bio); spin_lock_irq(q->queue_lock); if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); switch (el_ret) { case 
ELEVATOR_BACK_MERGE: BUG_ON(!rq_mergeable(req)); if (!ll_back_merge_fn(q, req, bio)) break; trace_block_bio_backmerge(q, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; case ELEVATOR_FRONT_MERGE: BUG_ON(!rq_mergeable(req)); if (!ll_front_merge_fn(q, req, bio)) break; trace_block_bio_frontmerge(q, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { blk_rq_set_mixed_merge(req); req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= ff; } bio->bi_next = req->bio; req->bio = bio; req->buffer = bio_data(bio); req->__sector = bio->bi_sector; req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; default: ; } get_rq: rw_flags = bio_data_dir(bio); if (sync) rw_flags |= REQ_RW_SYNC; req = get_request_wait(q, rw_flags, bio); init_request_from_bio(req, bio); spin_lock_irq(q->queue_lock); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || bio_flagged(bio, BIO_CPU_AFFINE)) req->cpu = blk_cpu_to_group(smp_processor_id()); if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; } 特别关注add_request和__generic_unplug_device

    add_request将会执行电梯调度算法中的具体流程

    Block/blk-core.c static inline void add_request(struct request_queue *q, struct request *req) { drive_stat_acct(req, 1); __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); } Block/elevator.c void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { if (q->ordcolor) rq->cmd_flags |= REQ_ORDERED_COLOR; if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { if (blk_barrier_rq(rq)) q->ordcolor ^= 1; if (where == ELEVATOR_INSERT_SORT) where = ELEVATOR_INSERT_BACK; if (blk_fs_request(rq) || blk_discard_rq(rq)) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) where = ELEVATOR_INSERT_BACK; if (plug) blk_plug_device(q); elv_insert(q, rq, where); } Block/elevator.c void elv_insert(struct request_queue *q, struct request *rq, int where) { struct list_head *pos; unsigned ordseq; int unplug_it = 1; trace_block_rq_insert(q, rq); rq->q = q; switch (where) { case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_BACK: rq->cmd_flags |= REQ_SOFTBARRIER; elv_drain_elevator(q); list_add_tail(&rq->queuelist, &q->queue_head); __blk_run_queue(q); break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq)); rq->cmd_flags |= REQ_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } q->elevator->ops->elevator_add_req_fn(q, rq); break; case ELEVATOR_INSERT_REQUEUE: rq->cmd_flags |= REQ_SOFTBARRIER; unplug_it = 0; if (q->ordseq == 0) { list_add(&rq->queuelist, &q->queue_head); break; } ordseq = blk_ordered_req_seq(rq); list_for_each(pos, &q->queue_head) { struct request *pos_rq = list_entry_rq(pos); if (ordseq <= blk_ordered_req_seq(pos_rq)) break; } list_add_tail(&rq->queuelist, pos); break; default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); BUG(); } if (unplug_it && 
blk_queue_plugged(q)) { int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - queue_in_flight(q); if (nrq >= q->unplug_thresh) __generic_unplug_device(q); } }

    从io调度层取出request是__generic_unplug_device完成

    Block/blk-core.c void __generic_unplug_device(struct request_queue *q) { if (unlikely(blk_queue_stopped(q))) return; if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) return; q->request_fn(q); /* 设备函数,例如scsi设备 */ } request_fn是特定的设备函数,类似scsi,它将会通过scsi_dispatch_cmd将scsi指令发送到设备

    那么怎么知道io请求已经完成了呢?

    硬件驱动中也会提供io complete的函数,它们最终都会执行blk_complete_request

    Block/blk-softirq.c void blk_complete_request(struct request *req) { if (unlikely(blk_should_fake_timeout(req->q))) return; if (!blk_mark_rq_complete(req)) __blk_complete_request(req); } Block/blk-softirq.c void __blk_complete_request(struct request *req) { struct request_queue *q = req->q; unsigned long flags; int ccpu, cpu, group_cpu; BUG_ON(!q->softirq_done_fn); local_irq_save(flags); cpu = smp_processor_id(); group_cpu = blk_cpu_to_group(cpu); /* * Select completion CPU */ if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) ccpu = req->cpu; else ccpu = cpu; if (ccpu == cpu || ccpu == group_cpu) { struct list_head *list; do_local: list = &__get_cpu_var(blk_cpu_done); list_add_tail(&req->csd.list, list); if (list->next == &req->csd.list) raise_softirq_irqoff(BLOCK_SOFTIRQ); } else if (raise_blk_irq(ccpu, req)) goto do_local; local_irq_restore(flags); } blk_complete_request是硬件中断上下文的最后一个函数,它把最后io完成后需要完成的工作交给了软中断BLOCK_SOFTIRQ

    IO完成的后续处理交给软中断BLOCK_SOFTIRQ之后,完成通知逐层向上传递,到达bio层时将会执行mpage_bio_submit中注册的bio的end_io,即mpage_end_io_read/mpage_end_io_write

       

    转载请注明原文地址: https://ju.6miu.com/read-1299510.html
    最新回复(0)