BibTeX @article{ALSHARID2022102630, title = {Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks}, journal = {Medical Image Analysis}, volume = {82}, pages = {102630}, year = {2022}, issn = {1361-8415}, doi = {https://doi.org/10.1016/j.media.2022.102630}, url = {https://www.sciencedirect.com/science/article/pii/S1361841522002584}, author = {Mohammad Alsharid and Yifan Cai and Harshita Sharma and Lior Drukker and Aris T. Papageorghiou and J. Alison Noble}, keywords = {Video captioning, Gaze tracking, Fetal ultrasound, Audio–visual, Multi-modal}, abstract = {In this work, we present a novel gaze-assisted natural language processing (NLP)-based video captioning model to describe routine second-trimester fetal ultrasound scan videos in a vocabulary of spoken sonography.
We present a novel multi-task neural network called Temporal SonoEyeNet (TSEN) with a primary task to describe the visual navigation process of sonographers by learning to generate visual attention maps of ultrasound images around standard biometry …
Gaze tracking is a promising technology for studying the visual perception of clinicians during image-based medical exams. It could be used in longitudinal studies to analyze their perceptive process, explore human-machine interactions, and develop …
Gaze tracking is a promising technology for studying the visual perception of clinicians during image-based medical exams. It could be used in longitudinal studies to analyze their perceptive process, explore human-machine interactions, and develop …
For visual tasks like ultrasound (US) scanning, experts direct their gaze towards regions of task-relevant information. Therefore, learning to predict the gaze of sonographers on US videos captures the spatio-temporal patterns that are important for …
Image representations are commonly learned from class labels, which are a simplistic approximation of human image understanding. In this paper we demonstrate that transferable representations of images can be learned without manual annotations by …
Recent automated medical image analysis methods have attained state-of-the-art performance but have relied on memory and compute-intensive deep learning models. Reducing model size without significant loss in performance metrics is crucial for time …
We present a novel multi-task convolutional neural network called Multi-task SonoEyeNet (M-SEN) that learns to generate clinically relevant visual attention maps using sonographer gaze tracking data on input ultrasound (US) video frames so as to …