每个块设备或者块设备的分区,都对应有自身的请求队列(request_queue),
而每个请求队列都可以选择一个IO调度器来协调所递交的request。
IO调度器的基本目的是将请求按照它们对应在块设备上的扇区号进行排列,以减少磁头的移动,提高效率。
每个request_queue都有一个request的队列,队列里的请求将按顺序被响应。
实际上,除了这个队列,每个调度器自身都维护有不同数量的队列,用来对递交上来的request进行处理,
而排在队列最前面的request将适时被移动到request_queue中等待响应。
Noop 调度器(no operations) : 是最简单的IO调度器,它本质上就是一个链表实现的 fifo 队列,并对请求进行简单的合并处理,。
此调度程序最适合于SSD,专为随机访问设备而设计
首先要了解描述elevator的数据结构。
和elevator相关的数据结构有个,一个是elevator_type,一个是elevator_queue,
前者对应一个调度器类型,后者对应一个调度器队列。
如果内核中只有上述四种类型的调度器,则就有四个elevator_type,但是多个块设备(分区)可拥有多个相应的elevator_queue。
两个数据结构中最关键的元素都是struct elevator_ops,该结构定义了一组操作函数,
用来描述请求队列的相关算法,实现对请求的处理。
./include/linux/elevator.h
8 typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
9 struct bio *);
10
11 typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *);
12
13 typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int);
14
15 typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
16
17 typedef int (elevator_dispatch_fn) (struct request_queue *, int);
18
19 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
20 typedef int (elevator_queue_empty_fn) (struct request_queue *);
21 typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
22 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
23 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
24
25 typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
26 typedef void (elevator_put_req_fn) (struct request *);
27 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
28 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
29
30 typedef void *(elevator_init_fn) (struct request_queue *);
31 typedef void (elevator_exit_fn) (struct elevator_queue *);
33 struct elevator_ops
34 {
35 elevator_merge_fn *elevator_merge_fn;
36 elevator_merged_fn *elevator_merged_fn;
37 elevator_merge_req_fn *elevator_merge_req_fn;
38 elevator_allow_merge_fn *elevator_allow_merge_fn;
39
40 elevator_dispatch_fn *elevator_dispatch_fn;
41 elevator_add_req_fn *elevator_add_req_fn;
42 elevator_activate_req_fn *elevator_activate_req_fn;
43 elevator_deactivate_req_fn *elevator_deactivate_req_fn;
44
45 elevator_queue_empty_fn *elevator_queue_empty_fn;
46 elevator_completed_req_fn *elevator_completed_req_fn;
47
48 elevator_request_list_fn *elevator_former_req_fn;
49 elevator_request_list_fn *elevator_latter_req_fn;
50
51 elevator_set_req_fn *elevator_set_req_fn;
52 elevator_put_req_fn *elevator_put_req_fn;
53
54 elevator_may_queue_fn *elevator_may_queue_fn;
55
56 elevator_init_fn *elevator_init_fn;
57 elevator_exit_fn *elevator_exit_fn;
58 void (*trim)(struct io_context *);
59 };
69 /*
70 * identifies an elevator type, such as AS or deadline
71 */
72 struct elevator_type
73 {
74 struct list_head list;
75 struct elevator_ops ops;
76 struct elv_fs_entry *elevator_attrs;
77 char elevator_name[ELV_NAME_MAX];
78 struct module *elevator_owner;
79 };
80
81 /*
82 * each queue has an elevator_queue associated with it
83 */
84 struct elevator_queue
85 {
86 struct elevator_ops *ops;
87 void *elevator_data;
88 struct kobject kobj;
89 struct elevator_type *elevator_type;
90 struct mutex sysfs_lock;
91 struct hlist_head *hash;
92 };
./block/elevator.c
247 int elevator_init(struct request_queue *q, char *name)
248 {
249 struct elevator_type *e = NULL;
250 struct elevator_queue *eq;
251 int ret = 0;
252 void *data;
253 // 初始化请求队列相关属性
254 INIT_LIST_HEAD(&q->queue_head);
255 q->last_merge = NULL;
256 q->end_sector = 0;
257 q->boundary_rq = NULL;
258 // 如果有指定名称的IO调度器,则获取与指定名称匹配的IO调度器
259 if (name) {
260 e = elevator_get(name);
261 if (!e)
262 return -EINVAL;
263 }
264 // 如果没有指定名称的IO调度器,存在可选IO调度器,获取可选IO调度器
265 if (!e && *chosen_elevator) {
266 e = elevator_get(chosen_elevator);
267 if (!e)
268 printk(KERN_ERR "I/O scheduler %s not found\n",
269 chosen_elevator);
270 }
271 // 如果没有指定名称或者可选的IO调度器,获取默认的调度器
272 if (!e) {
273 e = elevator_get(CONFIG_DEFAULT_IOSCHED);
274 if (!e) {
275 printk(KERN_ERR
276 "Default I/O scheduler not found. " \
277 "Using noop.\n");
278 e = elevator_get("noop");
279 }
280 }
281 // 为调度器分配elevator_queue
282 eq = elevator_alloc(q, e);
283 if (!eq)
284 return -ENOMEM;
285 // 从reqest_queue中为调度器数据分配内存
286 data = elevator_init_queue(q, eq);
287 if (!data) {
288 kobject_put(&eq->kobj);
289 return -ENOMEM;
290 }
291 // 关联request_queue,elevator_queue
292 elevator_attach(q, eq, data);
293 return ret;
294 }
295 EXPORT_SYMBOL(elevator_init);
296
297 void elevator_exit(struct elevator_queue *e)
298 {
299 mutex_lock(&e->sysfs_lock);
300 if (e->ops->elevator_exit_fn)
301 e->ops->elevator_exit_fn(e);
302 e->ops = NULL;
303 mutex_unlock(&e->sysfs_lock);
304
305 kobject_put(&e->kobj);
306 }
307 EXPORT_SYMBOL(elevator_exit);
207 static struct elevator_queue *elevator_alloc(struct request_queue *q,
208 struct elevator_type *e)
209 {
210 struct elevator_queue *eq;
211 int i;
212 // 为elevator_queue分配内存
213 eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
214 if (unlikely(!eq))
215 goto err;
216 // 使elevator_type与elevator_queue关联
217 eq->ops = &e->ops;
218 eq->elevator_type = e;
219 kobject_init(&eq->kobj, &elv_ktype);
220 mutex_init(&eq->sysfs_lock);
221 // 为elevator_queue hash表分配内存
222 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
223 GFP_KERNEL, q->node);
224 if (!eq->hash)
225 goto err;
226 // 初始化elevator_queue hash表
227 for (i = 0; i < ELV_HASH_ENTRIES; i++)
228 INIT_HLIST_HEAD(&eq->hash[i]);
229
230 return eq;
231 err:
232 kfree(eq);
233 elevator_put(e);
234 return NULL;
235 }
175 static void *elevator_init_queue(struct request_queue *q,
176 struct elevator_queue *eq)
177 {
178 return eq->ops->elevator_init_fn(q);
179 }
180
181 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
182 void *data)
183 {
184 q->elevator = eq;
185 eq->elevator_data = data;
186 }
./block/noop-iosched.c
87 static struct elevator_type elevator_noop = {
88 .ops = {
89 .elevator_merge_req_fn = noop_merged_requests,
90 .elevator_dispatch_fn = noop_dispatch,
91 .elevator_add_req_fn = noop_add_request,
92 .elevator_queue_empty_fn = noop_queue_empty,
93 .elevator_former_req_fn = noop_former_request,
94 .elevator_latter_req_fn = noop_latter_request,
95 .elevator_init_fn = noop_init_queue,
96 .elevator_exit_fn = noop_exit_queue,
97 },
98 .elevator_name = "noop",
99 .elevator_owner = THIS_MODULE,
100 };
101
102 static int __init noop_init(void)
103 {
104 elv_register(&elevator_noop);
105
106 return 0;
107 }
108
109 static void __exit noop_exit(void)
110 {
111 elv_unregister(&elevator_noop);
112 }
113
114 module_init(noop_init);
115 module_exit(noop_exit);
68 static void *noop_init_queue(struct request_queue *q)
69 {
70 struct noop_data *nd;
71 // 为noop_data从指点的q->node中分配内存
72 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
73 if (!nd)
74 return NULL;
75 INIT_LIST_HEAD(&nd->queue);
76 return nd;
77 }
10 struct noop_data {
11 struct list_head queue;
12 };
13
14 static void noop_merged_requests(struct request_queue *q, struct request *rq,
15 struct request *next)
16 {
17 list_del_init(&next->queuelist);
18 }
19
20 static int noop_dispatch(struct request_queue *q, int force)
21 {
22 struct noop_data *nd = q->elevator->elevator_data;
23
24 if (!list_empty(&nd->queue)) {
25 struct request *rq;
26 rq = list_entry(nd->queue.next, struct request, queuelist);
27 list_del_init(&rq->queuelist);
28 elv_dispatch_sort(q, rq);
29 return 1;
30 }
31 return 0;
32 }
33
34 static void noop_add_request(struct request_queue *q, struct request *rq)
35 {
36 struct noop_data *nd = q->elevator->elevator_data;
37
38 list_add_tail(&rq->queuelist, &nd->queue);
39 }
40
./include/linux/blkdev.h
153 /*
154 * try to put the fields that are referenced together in the same cacheline.
155 * if you modify this structure, be sure to check block/blk-core.c:rq_init()
156 * as well!
157 */
158 struct request {
159 struct list_head queuelist;
160 struct call_single_data csd;
161 int cpu;
162
163 struct request_queue *q;
164
165 unsigned int cmd_flags;
166 enum rq_cmd_type_bits cmd_type;
167 unsigned long atomic_flags;
168
169 /* the following two fields are internal, NEVER access directly */
170 sector_t __sector; /* sector cursor */
171 unsigned int __data_len; /* total data len */
172
173 struct bio *bio;
174 struct bio *biotail;
175
176 struct hlist_node hash; /* merge hash */
177 /*
178 * The rb_node is only used inside the io scheduler, requests
179 * are pruned when moved to the dispatch queue. So let the
180 * completion_data share space with the rb_node.
181 */
182 union {
183 struct rb_node rb_node; /* sort/lookup */
184 void *completion_data;
185 };
186
187 /*
188 * two pointers are available for the IO schedulers, if they need
189 * more they have to dynamically allocate it.
190 */
191 void *elevator_private;
192 void *elevator_private2;
193
194 struct gendisk *rq_disk;
195 unsigned long start_time;
196
197 /* Number of scatter-gather DMA addr+len pairs after
198 * physical address coalescing is performed.
199 */
200 unsigned short nr_phys_segments;
201
202 unsigned short ioprio;
203
204 void *special; /* opaque pointer available for LLD use */
205 char *buffer; /* kaddr of the current segment if available */
206
207 int tag;
208 int errors;
209
210 int ref_count;
211
212 /*
213 * when request is used as a packet command carrier
214 */
215 unsigned short cmd_len;
216 unsigned char __cmd[BLK_MAX_CDB];
217 unsigned char *cmd;
218
219 unsigned int extra_len; /* length of alignment and padding */
220 unsigned int sense_len;
221 unsigned int resid_len; /* residual count */
222 void *sense;
223
224 unsigned long deadline;
225 struct list_head timeout_list;
226 unsigned int timeout;
227 int retries;
228
229 /*
230 * completion callback.
231 */
232 rq_end_io_fn *end_io;
233 void *end_io_data;
234
235 /* for bidi */
236 struct request *next_rq;
237 };
324 struct request_queue
325 {
326 /*
327 * Together with queue_head for cacheline sharing
328 */
329 struct list_head queue_head;
330 struct request *last_merge;
331 struct elevator_queue *elevator;
332
333 /*
334 * the queue request freelist, one for reads and one for writes
335 */
336 struct request_list rq;
337
338 request_fn_proc *request_fn;
339 make_request_fn *make_request_fn;
340 prep_rq_fn *prep_rq_fn;
341 unplug_fn *unplug_fn;
342 merge_bvec_fn *merge_bvec_fn;
343 prepare_flush_fn *prepare_flush_fn;
344 softirq_done_fn *softirq_done_fn;
345 rq_timed_out_fn *rq_timed_out_fn;
346 dma_drain_needed_fn *dma_drain_needed;
347 lld_busy_fn *lld_busy_fn;
348
349 /*
350 * Dispatch queue sorting
351 */
352 sector_t end_sector;
353 struct request *boundary_rq;
354
355 /*
356 * Auto-unplugging state
357 */
358 struct timer_list unplug_timer;
359 int unplug_thresh; /* After this many requests */
360 unsigned long unplug_delay; /* After this many jiffies */
361 struct work_struct unplug_work;
362
363 struct backing_dev_info backing_dev_info;
364
365 /*
366 * The queue owner gets to use this for whatever they like.
367 * ll_rw_blk doesn't touch it.
368 */
369 void *queuedata;
370
371 /*
372 * queue needs bounce pages for pages above this limit
373 */
374 gfp_t bounce_gfp;
375
376 /*
377 * various queue flags, see QUEUE_* below
378 */
379 unsigned long queue_flags;
380
381 /*
382 * protects queue structures from reentrancy. ->__queue_lock should
383 * _never_ be used directly, it is queue private. always use
384 * ->queue_lock.
385 */
386 spinlock_t __queue_lock;
387 spinlock_t *queue_lock;
388
389 /*
390 * queue kobject
391 */
392 struct kobject kobj;
393
394 /*
395 * queue settings
396 */
397 unsigned long nr_requests; /* Max # of requests */
398 unsigned int nr_congestion_on;
399 unsigned int nr_congestion_off;
400 unsigned int nr_batching;
401
402 void *dma_drain_buffer;
403 unsigned int dma_drain_size;
404 unsigned int dma_pad_mask;
405 unsigned int dma_alignment;
406
407 struct blk_queue_tag *queue_tags;
408 struct list_head tag_busy_list;
409
410 unsigned int nr_sorted;
411 unsigned int in_flight[2];
412
413 unsigned int rq_timeout;
414 struct timer_list timeout;
415 struct list_head timeout_list;
416
417 struct queue_limits limits;
418
419 /*
420 * sg stuff
421 */
422 unsigned int sg_timeout;
423 unsigned int sg_reserved_size;
424 int node;
425 #ifdef CONFIG_BLK_DEV_IO_TRACE
426 struct blk_trace *blk_trace;
427 #endif
428 /*
429 * reserved for flush operations
430 */
431 unsigned int ordered, next_ordered, ordseq;
432 int orderr, ordcolor;
433 struct request pre_flush_rq, bar_rq, post_flush_rq;
434 struct request *orig_bar_rq;
435
436 struct mutex sysfs_lock;
437
438 #if defined(CONFIG_BLK_DEV_BSG)
439 struct bsg_class_device bsg_dev;
440 #endif
441 };
查看当前系统支持的IO调度算法
dmesg | grep -i scheduler
[ 0.207639] io scheduler noop registered
[ 0.207641] io scheduler deadline registered
[ 0.207675] io scheduler cfq registered (default)
查看当前系统的I/O调度方法:
cat /sys/block/xvda/queue/scheduler
noop deadline [cfq]
可以看出cfq为默认的调度方法